
[v6,05/16] iomap: Add async buffered write support

Message ID 20220526173840.578265-6-shr@fb.com (mailing list archive)
State Deferred, archived
Series io-uring/xfs: support async buffered writes

Commit Message

Stefan Roesch May 26, 2022, 5:38 p.m. UTC
This adds async buffered write support to iomap.

This replaces the call to balance_dirty_pages_ratelimited() with a call
to balance_dirty_pages_ratelimited_flags(), which allows specifying
whether the write request is async or not.

In addition, this moves the above function call to the beginning of the
function. If the call sits at the end of the function and the decision is
made to throttle writes, then there is no request that io-uring can wait
on. With the call at the beginning of the function, the write request is
not issued; -EAGAIN is returned instead, and io-uring punts the request
and processes it in an io-worker.

By moving the function call to the beginning of the function, the write
throttling will happen one page later.

Signed-off-by: Stefan Roesch <shr@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/iomap/buffered-io.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)
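
In outline, the change to the copy loop in iomap_write_iter() has the
following shape (a condensed sketch, not the full function; the fault-in
and copy-to-folio details are elided):

	do {
		/* new: throttle (or, for IOMAP_NOWAIT writes, bail out with
		 * -EAGAIN via BDP_ASYNC) before the folio is locked and
		 * dirtied */
		status = balance_dirty_pages_ratelimited_flags(mapping,
							       bdp_flags);
		if (unlikely(status))
			break;

		status = iomap_write_begin(iter, pos, bytes, &folio);
		if (unlikely(status))
			break;

		/* ... copy user data into the folio ('copied' bytes) ... */

		status = iomap_write_end(iter, pos, bytes, copied, folio);
		pos += status;
		written += status;
		length -= status;

		/* old: balance_dirty_pages_ratelimited() used to be called
		 * here, i.e. after the folio had already been dirtied */
	} while (iov_iter_count(i) && length);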

Comments

Darrick J. Wong May 26, 2022, 6:42 p.m. UTC | #1
On Thu, May 26, 2022 at 10:38:29AM -0700, Stefan Roesch wrote:
> This adds async buffered write support to iomap.
> 
> This replaces the call to balance_dirty_pages_ratelimited() with the
> call to balance_dirty_pages_ratelimited_flags. This allows to specify if
> the write request is async or not.
> 
> In addition this also moves the above function call to the beginning of
> the function. If the function call is at the end of the function and the
> decision is made to throttle writes, then there is no request that
> io-uring can wait on. By moving it to the beginning of the function, the
> write request is not issued, but returns -EAGAIN instead. io-uring will
> punt the request and process it in the io-worker.
> 
> By moving the function call to the beginning of the function, the write
> throttling will happen one page later.

It does?  I would have thought that moving it before the iomap_write_begin
call would make the throttling happen one page sooner?  Sorry if I'm
being dense here...

> Signed-off-by: Stefan Roesch <shr@fb.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
> ---
>  fs/iomap/buffered-io.c | 31 +++++++++++++++++++++++++++----
>  1 file changed, 27 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index d6ddc54e190e..2281667646d2 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -559,6 +559,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
>  	loff_t block_size = i_blocksize(iter->inode);
>  	loff_t block_start = round_down(pos, block_size);
>  	loff_t block_end = round_up(pos + len, block_size);
> +	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
>  	size_t from = offset_in_folio(folio, pos), to = from + len;
>  	size_t poff, plen;
>  
> @@ -567,6 +568,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
>  	folio_clear_error(folio);
>  
>  	iop = iomap_page_create(iter->inode, folio, iter->flags);
> +	if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
> +		return -EAGAIN;
>  
>  	do {
>  		iomap_adjust_read_range(iter->inode, folio, &block_start,
> @@ -584,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
>  				return -EIO;
>  			folio_zero_segments(folio, poff, from, to, poff + plen);
>  		} else {
> -			int status = iomap_read_folio_sync(block_start, folio,
> +			int status;
> +
> +			if (iter->flags & IOMAP_NOWAIT)
> +				return -EAGAIN;
> +
> +			status = iomap_read_folio_sync(block_start, folio,
>  					poff, plen, srcmap);
>  			if (status)
>  				return status;
> @@ -613,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
>  	unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
>  	int status = 0;
>  
> +	if (iter->flags & IOMAP_NOWAIT)
> +		fgp |= FGP_NOWAIT;

FGP_NOWAIT can cause __filemap_get_folio to return a NULL folio, which
makes iomap_write_begin return -ENOMEM.  If nothing has been written
yet, won't that cause the ENOMEM to escape to userspace?  Why do we want
that instead of EAGAIN?
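
One way to keep -EAGAIN semantics for nowait writers would be to
translate the failure at the __filemap_get_folio() call site, roughly
like this (a sketch, not part of this patch):

	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
			fgp, mapping_gfp_mask(iter->inode->i_mapping));
	if (!folio) {
		/* FGP_NOWAIT could not get or allocate the folio: return
		 * -EAGAIN so io_uring punts to an io-worker rather than
		 * surfacing -ENOMEM to userspace */
		status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
		goto out_no_page;
	}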

> +
>  	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
>  	if (srcmap != &iter->iomap)
>  		BUG_ON(pos + len > srcmap->offset + srcmap->length);
> @@ -750,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  	loff_t pos = iter->pos;
>  	ssize_t written = 0;
>  	long status = 0;
> +	struct address_space *mapping = iter->inode->i_mapping;
> +	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
>  
>  	do {
>  		struct folio *folio;
> @@ -762,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  		bytes = min_t(unsigned long, PAGE_SIZE - offset,
>  						iov_iter_count(i));
>  again:
> +		status = balance_dirty_pages_ratelimited_flags(mapping,
> +							       bdp_flags);
> +		if (unlikely(status))
> +			break;
> +
>  		if (bytes > length)
>  			bytes = length;
>  
> @@ -770,6 +788,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  		 * Otherwise there's a nasty deadlock on copying from the
>  		 * same page as we're writing to, without it being marked
>  		 * up-to-date.
> +		 *
> +		 * For async buffered writes the assumption is that the user
> +		 * page has already been faulted in. This can be optimized by
> +		 * faulting the user page in the prepare phase of io-uring.

I don't think this pattern is unique to async writes with io_uring --
gfs2 also wanted this "try as many pages as you can until you hit a page
fault and then return a short write to caller so it can fault in the
rest" behavior.

--D

>  		 */
>  		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
>  			status = -EFAULT;
> @@ -781,7 +803,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  			break;
>  
>  		page = folio_file_page(folio, pos >> PAGE_SHIFT);
> -		if (mapping_writably_mapped(iter->inode->i_mapping))
> +		if (mapping_writably_mapped(mapping))
>  			flush_dcache_page(page);
>  
>  		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
> @@ -806,8 +828,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  		pos += status;
>  		written += status;
>  		length -= status;
> -
> -		balance_dirty_pages_ratelimited(iter->inode->i_mapping);
>  	} while (iov_iter_count(i) && length);
>  
>  	return written ? written : status;
> @@ -825,6 +845,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
>  	};
>  	int ret;
>  
> +	if (iocb->ki_flags & IOCB_NOWAIT)
> +		iter.flags |= IOMAP_NOWAIT;
> +
>  	while ((ret = iomap_iter(&iter, ops)) > 0)
>  		iter.processed = iomap_write_iter(&iter, i);
>  	if (iter.pos == iocb->ki_pos)
> -- 
> 2.30.2
>
Dave Chinner May 26, 2022, 10:37 p.m. UTC | #2
On Thu, May 26, 2022 at 10:38:29AM -0700, Stefan Roesch wrote:
> This adds async buffered write support to iomap.
> 
> This replaces the call to balance_dirty_pages_ratelimited() with the
> call to balance_dirty_pages_ratelimited_flags. This allows to specify if
> the write request is async or not.
> 
> In addition this also moves the above function call to the beginning of
> the function. If the function call is at the end of the function and the
> decision is made to throttle writes, then there is no request that
> io-uring can wait on. By moving it to the beginning of the function, the
> write request is not issued, but returns -EAGAIN instead. io-uring will
> punt the request and process it in the io-worker.
> 
> By moving the function call to the beginning of the function, the write
> throttling will happen one page later.

Won't it happen one page sooner? I.e. on single page writes we'll
end up throttling *before* we dirty the page, not *after* we dirty
the page. IOWs, we can't wait for the page that we just dirtied to
be cleaned to make progress and so this now makes the loop dependent
on pages dirtied by other writers being cleaned to guarantee
forwards progress?

That seems like a subtle but quite significant change of
algorithm...

> Signed-off-by: Stefan Roesch <shr@fb.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index d6ddc54e190e..2281667646d2 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -559,6 +559,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
>  	loff_t block_size = i_blocksize(iter->inode);
>  	loff_t block_start = round_down(pos, block_size);
>  	loff_t block_end = round_up(pos + len, block_size);
> +	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
>  	size_t from = offset_in_folio(folio, pos), to = from + len;
>  	size_t poff, plen;
>  
> @@ -567,6 +568,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
>  	folio_clear_error(folio);
>  
>  	iop = iomap_page_create(iter->inode, folio, iter->flags);
> +	if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
> +		return -EAGAIN;
>  

Hmmm. I see what looks to be an undesirable pattern here...

1. Memory allocation failure here on the second page of a write.

> @@ -806,8 +828,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  		pos += status;
>  		written += status;
>  		length -= status;
> -
> -		balance_dirty_pages_ratelimited(iter->inode->i_mapping);
>  	} while (iov_iter_count(i) && length);
>  
>  	return written ? written : status;

2. we break and return 4kB from the first page copied.

> @@ -825,6 +845,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
>  	};
>  	int ret;
>  
> +	if (iocb->ki_flags & IOCB_NOWAIT)
> +		iter.flags |= IOMAP_NOWAIT;
> +
>  	while ((ret = iomap_iter(&iter, ops)) > 0)
>  		iter.processed = iomap_write_iter(&iter, i);

3. This sets iter.processed = 4kB, and we call iomap_iter() again.
This sees iter.processed > 0 and there's still more to write, so
it returns 1, and go around the loop again.

Hence spurious memory allocation failures in the IOMAP_NOWAIT case will
not cause this buffered write loop to exit. Worst case, we fail
allocation on every second __iomap_write_begin() call and so the
write takes much longer and consumes lots more CPU hammering memory
allocation, because no single memory allocation failure will cause the
write to return a short write to the caller.

This seems undesirable to me. If we are failing memory allocations,
we need to back off, not hammer memory allocation harder without
allowing reclaim to make progress...
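
For reference, the continuation behaviour described above comes from how
iomap_iter() consumes the previous iteration; condensed, it is roughly:

	/* condensed sketch of the advance step in iomap_iter() */
	if (iter->iomap.length) {		/* not the first call */
		if (iter->processed <= 0)
			return iter->processed;	/* 0 or an error ends the walk */
		iter->pos += iter->processed;	/* partial progress: advance */
		iter->len -= iter->processed;
		if (!iter->len)
			return 0;
	}
	/* ... map the next range via ->iomap_begin() and return 1 ... */

so a positive, short iter.processed silently drops the error that ended
the pass and keeps the write going.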

Cheers,

Dave.
Jan Kara May 27, 2022, 8:42 a.m. UTC | #3
On Fri 27-05-22 08:37:05, Dave Chinner wrote:
> On Thu, May 26, 2022 at 10:38:29AM -0700, Stefan Roesch wrote:
> > This adds async buffered write support to iomap.
> > 
> > This replaces the call to balance_dirty_pages_ratelimited() with the
> > call to balance_dirty_pages_ratelimited_flags. This allows to specify if
> > the write request is async or not.
> > 
> > In addition this also moves the above function call to the beginning of
> > the function. If the function call is at the end of the function and the
> > decision is made to throttle writes, then there is no request that
> > io-uring can wait on. By moving it to the beginning of the function, the
> > write request is not issued, but returns -EAGAIN instead. io-uring will
> > punt the request and process it in the io-worker.
> > 
> > By moving the function call to the beginning of the function, the write
> > throttling will happen one page later.
> 
> Won't it happen one page sooner? I.e. on single page writes we'll
> end up throttling *before* we dirty the page, not *after* we dirty
> the page. IOWs, we can't wait for the page that we just dirtied to
> be cleaned to make progress and so this now makes the loop dependent
> on pages dirtied by other writers being cleaned to guarantee
> forwards progress?
> 
> That seems like a subtle but quite significant change of
> algorithm...

So I'm convinced the difference will be pretty much in the noise because of
how many dirty pages there have to be to even start throttling processes,
but some more arguments are:

* we ratelimit calls to balance_dirty_pages() based on number of pages
  dirtied by the current process in balance_dirty_pages_ratelimited()

* balance_dirty_pages() uses number of pages dirtied by the current process
  to decide about the delay.

So the only situation where I could see this making a difference would be
if the dirty limit is a handful of pages, and even there I have a hard time
seeing how exactly. So I'm ok with the change, and if we see it causing
problems somewhere, we'll think about how to fix it based on the exact scenario.

I guess the above two points are the reason why Stefan writes about throttling
one page later: we count only the number of pages dirtied up to this
moment, so the page dirtied by this iteration of the loop in iomap_write_iter()
will be reflected only by the call to balance_dirty_pages_ratelimited() in
the next iteration (or in the next call to iomap_write_iter()).
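
For reference, the accounting both points rely on looks roughly like this
(heavily condensed from mm/page-writeback.c; per-cpu ratelimit leaking and
cgroup writeback details omitted):

	/* folio_account_dirtied(): every newly dirtied page bumps a
	 * per-task counter */
	current->nr_dirtied += nr;

	/* balance_dirty_pages_ratelimited_flags(): only enter the real
	 * throttling path once the task has dirtied enough pages since it
	 * was last throttled */
	ratelimit = current->nr_dirtied_pause;
	if (unlikely(current->nr_dirtied >= ratelimit))
		ret = balance_dirty_pages(wb, current->nr_dirtied, flags);

	/* balance_dirty_pages() resets current->nr_dirtied and recomputes
	 * current->nr_dirtied_pause from how far we are over the limits */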

								Honza
Dave Chinner May 27, 2022, 10:52 p.m. UTC | #4
On Fri, May 27, 2022 at 10:42:03AM +0200, Jan Kara wrote:
> On Fri 27-05-22 08:37:05, Dave Chinner wrote:
> > On Thu, May 26, 2022 at 10:38:29AM -0700, Stefan Roesch wrote:
> > > This adds async buffered write support to iomap.
> > > 
> > > This replaces the call to balance_dirty_pages_ratelimited() with the
> > > call to balance_dirty_pages_ratelimited_flags. This allows to specify if
> > > the write request is async or not.
> > > 
> > > In addition this also moves the above function call to the beginning of
> > > the function. If the function call is at the end of the function and the
> > > decision is made to throttle writes, then there is no request that
> > > io-uring can wait on. By moving it to the beginning of the function, the
> > > write request is not issued, but returns -EAGAIN instead. io-uring will
> > > punt the request and process it in the io-worker.
> > > 
> > > By moving the function call to the beginning of the function, the write
> > > throttling will happen one page later.
> > 
> > Won't it happen one page sooner? I.e. on single page writes we'll
> > end up throttling *before* we dirty the page, not *after* we dirty
> > the page. IOWs, we can't wait for the page that we just dirtied to
> > be cleaned to make progress and so this now makes the loop dependent
> > on pages dirtied by other writers being cleaned to guarantee
> > forwards progress?
> > 
> > That seems like a subtle but quite significant change of
> > algorithm...
> 
> So I'm convinced the difference will be pretty much in the noise because of
> how many dirty pages there have to be to even start throttling processes
> but some more arguments are:
> 
> * we ratelimit calls to balance_dirty_pages() based on number of pages
>   dirtied by the current process in balance_dirty_pages_ratelimited()
> 
> * balance_dirty_pages() uses number of pages dirtied by the current process
>   to decide about the delay.
> 
> So the only situation where I could see this making a difference would be
> if dirty limit is a handful of pages and even there I have hard time to see
> how exactly.

That's kinda what worries me - we do see people winding the dirty
thresholds way down to work around various niche problems with
dirty page buildup.

We also have small extra accounting overhead for cases where we've
stacked layers so that the lower layers don't dirty throttle before
the higher layer. If the lower layer throttles first, then the
higher layer can't clean pages and we can deadlock.

Those are the sorts of subtle, niche situations where I worry that
the subtle "throttle first, write second" change could manifest...

Cheers,

Dave.
Christoph Hellwig May 31, 2022, 6:58 a.m. UTC | #5
I'll leave the throttling algorithm discussion to the experts, but
from the pure code POV this looks good to me:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Jan Kara May 31, 2022, 7:55 a.m. UTC | #6
On Sat 28-05-22 08:52:40, Dave Chinner wrote:
> On Fri, May 27, 2022 at 10:42:03AM +0200, Jan Kara wrote:
> > On Fri 27-05-22 08:37:05, Dave Chinner wrote:
> > > On Thu, May 26, 2022 at 10:38:29AM -0700, Stefan Roesch wrote:
> > > > This adds async buffered write support to iomap.
> > > > 
> > > > This replaces the call to balance_dirty_pages_ratelimited() with the
> > > > call to balance_dirty_pages_ratelimited_flags. This allows to specify if
> > > > the write request is async or not.
> > > > 
> > > > In addition this also moves the above function call to the beginning of
> > > > the function. If the function call is at the end of the function and the
> > > > decision is made to throttle writes, then there is no request that
> > > > io-uring can wait on. By moving it to the beginning of the function, the
> > > > write request is not issued, but returns -EAGAIN instead. io-uring will
> > > > punt the request and process it in the io-worker.
> > > > 
> > > > By moving the function call to the beginning of the function, the write
> > > > throttling will happen one page later.
> > > 
> > > Won't it happen one page sooner? I.e. on single page writes we'll
> > > end up throttling *before* we dirty the page, not *after* we dirty
> > > the page. IOWs, we can't wait for the page that we just dirtied to
> > > be cleaned to make progress and so this now makes the loop dependent
> > > on pages dirtied by other writers being cleaned to guarantee
> > > forwards progress?
> > > 
> > > That seems like a subtle but quite significant change of
> > > algorithm...
> > 
> > So I'm convinced the difference will be pretty much in the noise because of
> > how many dirty pages there have to be to even start throttling processes
> > but some more arguments are:
> > 
> > * we ratelimit calls to balance_dirty_pages() based on number of pages
> >   dirtied by the current process in balance_dirty_pages_ratelimited()
> > 
> > * balance_dirty_pages() uses number of pages dirtied by the current process
> >   to decide about the delay.
> > 
> > So the only situation where I could see this making a difference would be
> > if dirty limit is a handful of pages and even there I have hard time to see
> > how exactly.
> 
> That's kinda what worries me - we do see people winding the dirty
> thresholds way down to work around various niche problems with
> dirty page buildup.
> 
> We also have small extra accounting overhead for cases where we've
> stacked layers to so the lower layers don't dirty throttle before
> the higher layer. If the lower layer throttles first, then the
> higher layer can't clean pages and we can deadlock.
> 
> Those are the sorts of subtle, niche situations where I worry that
> the subtle "throttle first, write second" change could manifest...

Well, I'd think about the change more as "write first, throttle on next
write", because balance_dirty_pages_ratelimited() throttles based on the
number of pages dirtied up to the moment it is called. So the first invocation
of balance_dirty_pages_ratelimited() will not do anything because
current->nr_dirtied will be zero. So effectively we always let the process
run longer than before the change before we throttle it. But the number of
dirtied pages until we throttle should be the same for both cases.

								Honza

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d6ddc54e190e..2281667646d2 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -559,6 +559,7 @@  static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 	loff_t block_size = i_blocksize(iter->inode);
 	loff_t block_start = round_down(pos, block_size);
 	loff_t block_end = round_up(pos + len, block_size);
+	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
 	size_t from = offset_in_folio(folio, pos), to = from + len;
 	size_t poff, plen;
 
@@ -567,6 +568,8 @@  static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 	folio_clear_error(folio);
 
 	iop = iomap_page_create(iter->inode, folio, iter->flags);
+	if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
+		return -EAGAIN;
 
 	do {
 		iomap_adjust_read_range(iter->inode, folio, &block_start,
@@ -584,7 +587,12 @@  static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 				return -EIO;
 			folio_zero_segments(folio, poff, from, to, poff + plen);
 		} else {
-			int status = iomap_read_folio_sync(block_start, folio,
+			int status;
+
+			if (iter->flags & IOMAP_NOWAIT)
+				return -EAGAIN;
+
+			status = iomap_read_folio_sync(block_start, folio,
 					poff, plen, srcmap);
 			if (status)
 				return status;
@@ -613,6 +621,9 @@  static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 	unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
 	int status = 0;
 
+	if (iter->flags & IOMAP_NOWAIT)
+		fgp |= FGP_NOWAIT;
+
 	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
 	if (srcmap != &iter->iomap)
 		BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -750,6 +761,8 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 	loff_t pos = iter->pos;
 	ssize_t written = 0;
 	long status = 0;
+	struct address_space *mapping = iter->inode->i_mapping;
+	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
 
 	do {
 		struct folio *folio;
@@ -762,6 +775,11 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		bytes = min_t(unsigned long, PAGE_SIZE - offset,
 						iov_iter_count(i));
 again:
+		status = balance_dirty_pages_ratelimited_flags(mapping,
+							       bdp_flags);
+		if (unlikely(status))
+			break;
+
 		if (bytes > length)
 			bytes = length;
 
@@ -770,6 +788,10 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		 * Otherwise there's a nasty deadlock on copying from the
 		 * same page as we're writing to, without it being marked
 		 * up-to-date.
+		 *
+		 * For async buffered writes the assumption is that the user
+		 * page has already been faulted in. This can be optimized by
+		 * faulting the user page in the prepare phase of io-uring.
 		 */
 		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
 			status = -EFAULT;
@@ -781,7 +803,7 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 			break;
 
 		page = folio_file_page(folio, pos >> PAGE_SHIFT);
-		if (mapping_writably_mapped(iter->inode->i_mapping))
+		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
 		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
@@ -806,8 +828,6 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		pos += status;
 		written += status;
 		length -= status;
-
-		balance_dirty_pages_ratelimited(iter->inode->i_mapping);
 	} while (iov_iter_count(i) && length);
 
 	return written ? written : status;
@@ -825,6 +845,9 @@  iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 	};
 	int ret;
 
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		iter.flags |= IOMAP_NOWAIT;
+
 	while ((ret = iomap_iter(&iter, ops)) > 0)
 		iter.processed = iomap_write_iter(&iter, i);
 	if (iter.pos == iocb->ki_pos)