
[12/13] iomap: submit ioends immediately

Message ID 20231126124720.1249310-13-hch@lst.de (mailing list archive)
State New, archived
Series [01/13] iomap: clear the per-folio dirty bits on all writeback failures

Commit Message

Christoph Hellwig Nov. 26, 2023, 12:47 p.m. UTC
Currently the writeback code delays submitting filled ioends until we
reach the end of the folio.  The reason for that is that otherwise
the end I/O handler could clear the writeback bit before we've even
finished submitting all I/O for the folio.

Add a bias to ifs->write_bytes_pending while we are submitting I/O
for a folio so that it never reaches zero until all I/O is completed
to prevent the early writeback bit clearing, and remove the now
superfluous submit_list.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/buffered-io.c | 157 ++++++++++++++++++++---------------------
 1 file changed, 75 insertions(+), 82 deletions(-)
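
For readers skimming the thread, here is a minimal user-space sketch of the
biasing idea described in the commit message.  It is illustrative only: C11
atomics stand in for the kernel's atomic_t, and the names (end_writeback,
block_io_done) are made up rather than taken from the iomap code.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int write_bytes_pending;

static void end_writeback(void)
{
	puts("writeback bit cleared");
}

/* Completion side: may run as soon as a block's bio has been submitted. */
static void block_io_done(int len)
{
	if (atomic_fetch_sub(&write_bytes_pending, len) == len)
		end_writeback();
}

int main(void)
{
	int lens[3] = { 4096, 4096, 4096 };
	int i;

	/* Bias: keeps the counter from reaching zero mid-submission. */
	atomic_fetch_add(&write_bytes_pending, 1);

	for (i = 0; i < 3; i++) {
		atomic_fetch_add(&write_bytes_pending, lens[i]);
		block_io_done(lens[i]);	/* pretend the I/O finished instantly */
	}

	/* Drop the bias; if all I/O already finished, end writeback here. */
	if (atomic_fetch_sub(&write_bytes_pending, 1) == 1)
		end_writeback();
	return 0;
}

Without the initial increment, the completion inside the loop would see the
counter drop to zero after the first block and clear the writeback bit while
later blocks were still being submitted.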

Comments

Darrick J. Wong Nov. 29, 2023, 5:14 a.m. UTC | #1
On Sun, Nov 26, 2023 at 01:47:19PM +0100, Christoph Hellwig wrote:
> Currently the writeback code delays submitting filled ioends until we
> reach the end of the folio.  The reason for that is that otherwise
> the end I/O handler could clear the writeback bit before we've even
> finished submitting all I/O for the folio.
> 
> Add a bias to ifs->write_bytes_pending while we are submitting I/O
> for a folio so that it never reaches zero until all I/O is completed
> to prevent the early writeback bit clearing, and remove the now
> superfluous submit_list.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks ok, we'll see what happens in the last patch...
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D


Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9f223820f60d22..a01b0686e7c8a0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1620,30 +1620,34 @@  static void iomap_writepage_end_bio(struct bio *bio)
  * Submit the final bio for an ioend.
  *
  * If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback
- * and unlocked them.  In this situation, we need to fail the bio instead of
- * submitting it.  This typically only happens on a filesystem shutdown.
+ * the submission process has failed after we've marked pages for writeback.
+ * We cannot cancel the ioend directly in that case, so call the bio end I/O
+ * handler with the error status here to run the normal I/O completion handler
+ * to clear the writeback bit and let the file system process the errors.
  */
-static int
-iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
-		int error)
+static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
 {
+	if (!wpc->ioend)
+		return error;
+
+	/*
+	 * Let the file system prepare the I/O submission and hook in an I/O
+	 * completion handler.  This also needs to happen after a failure so
+	 * that the file system end I/O handler gets called to clean up.
+	 */
 	if (wpc->ops->prepare_ioend)
-		error = wpc->ops->prepare_ioend(ioend, error);
+		error = wpc->ops->prepare_ioend(wpc->ioend, error);
+
 	if (error) {
-		/*
-		 * If we're failing the IO now, just mark the ioend with an
-		 * error and finish it.  This will run IO completion immediately
-		 * as there is only one reference to the ioend at this point in
-		 * time.
-		 */
-		ioend->io_bio.bi_status = errno_to_blk_status(error);
-		bio_endio(&ioend->io_bio);
-		return error;
+		wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+		bio_endio(&wpc->ioend->io_bio);
+	} else {
+		submit_bio(&wpc->ioend->io_bio);
 	}
 
-	submit_bio(&ioend->io_bio);
-	return 0;
+	wpc->ioend = NULL;
+	return error;
 }
 
 static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
@@ -1698,19 +1702,28 @@  iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset)
 /*
  * Test to see if we have an existing ioend structure that we could append to
  * first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly.  Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
  */
-static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct folio *folio,
-		struct inode *inode, loff_t pos, struct list_head *iolist)
+		struct inode *inode, loff_t pos)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	unsigned len = i_blocksize(inode);
 	size_t poff = offset_in_folio(folio, pos);
+	int error;
 
 	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
 new_ioend:
-		if (wpc->ioend)
-			list_add(&wpc->ioend->io_list, iolist);
+		error = iomap_submit_ioend(wpc, 0);
+		if (error)
+			return error;
 		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
 	}
 
@@ -1721,12 +1734,12 @@  static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 		atomic_add(len, &ifs->write_bytes_pending);
 	wpc->ioend->io_size += len;
 	wbc_account_cgroup_owner(wbc, &folio->page, len);
+	return 0;
 }
 
 static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct folio *folio,
-		struct inode *inode, u64 pos, unsigned *count,
-		struct list_head *submit_list)
+		struct inode *inode, u64 pos, unsigned *count)
 {
 	int error;
 
@@ -1743,7 +1756,7 @@  static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
 	case IOMAP_HOLE:
 		break;
 	default:
-		iomap_add_to_ioend(wpc, wbc, folio, inode, pos, submit_list);
+		iomap_add_to_ioend(wpc, wbc, folio, inode, pos);
 		(*count)++;
 	}
 
@@ -1820,35 +1833,21 @@  static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
 	return true;
 }
 
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * the forward progress guarantees we need to provide. The current ioend we're
- * adding blocks to is cached in the writepage context, and if the new block
- * doesn't append to the cached ioend, it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected.  While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
 static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct folio *folio)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	struct inode *inode = folio->mapping->host;
-	struct iomap_ioend *ioend, *next;
 	unsigned len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
 	u64 pos = folio_pos(folio);
 	u64 end_pos = pos + folio_size(folio);
 	unsigned count = 0;
 	int error = 0, i;
-	LIST_HEAD(submit_list);
+
+	WARN_ON_ONCE(!folio_test_locked(folio));
+	WARN_ON_ONCE(folio_test_dirty(folio));
+	WARN_ON_ONCE(folio_test_writeback(folio));
 
 	trace_iomap_writepage(inode, pos, folio_size(folio));
 
@@ -1858,12 +1857,27 @@  static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	}
 	WARN_ON_ONCE(end_pos <= pos);
 
-	if (!ifs && nblocks > 1) {
-		ifs = ifs_alloc(inode, folio, 0);
-		iomap_set_range_dirty(folio, 0, end_pos - pos);
+	if (nblocks > 1) {
+		if (!ifs) {
+			ifs = ifs_alloc(inode, folio, 0);
+			iomap_set_range_dirty(folio, 0, end_pos - pos);
+		}
+
+		/*
+		 * Keep the I/O completion handler from clearing the writeback
+		 * bit until we have submitted all blocks by adding a bias to
+		 * ifs->write_bytes_pending, which is dropped after submitting
+		 * all blocks.
+		 */
+		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+		atomic_inc(&ifs->write_bytes_pending);
 	}
 
-	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
+	/*
+	 * Set the writeback bit ASAP, as the I/O completion for the single
+	 * block per folio case can happen as soon as we submit the bio.
+	 */
+	folio_start_writeback(folio);
 
 	/*
 	 * Walk through the folio to find areas to write back. If we
@@ -1874,18 +1888,13 @@  static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
 			continue;
 		error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, pos,
-				&count, &submit_list);
+				&count);
 		if (error)
 			break;
 	}
 	if (count)
 		wpc->nr_folios++;
 
-	WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
-	WARN_ON_ONCE(!folio_test_locked(folio));
-	WARN_ON_ONCE(folio_test_writeback(folio));
-	WARN_ON_ONCE(folio_test_dirty(folio));
-
 	/*
 	 * We can have dirty bits set past end of file in page_mkwrite path
 	 * while mapping the last partial folio. Hence it's better to clear
@@ -1893,35 +1902,21 @@  static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 */
 	iomap_clear_range_dirty(folio, 0, folio_size(folio));
 
-	if (error && !count) {
-		folio_unlock(folio);
-		goto done;
-	}
-
-	folio_start_writeback(folio);
-	folio_unlock(folio);
-
 	/*
-	 * Preserve the original error if there was one; catch
-	 * submission errors here and propagate into subsequent ioend
-	 * submissions.
+	 * Usually the writeback bit is cleared by the I/O completion handler.
+	 * But we may end up either not actually writing any blocks, or (when
+	 * there are multiple blocks in a folio) all I/O might have finished
+	 * already at this point.  In that case we need to clear the writeback
+	 * bit ourselves right after unlocking the folio.
 	 */
-	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
-		int error2;
-
-		list_del_init(&ioend->io_list);
-		error2 = iomap_submit_ioend(wpc, ioend, error);
-		if (error2 && !error)
-			error = error2;
+	folio_unlock(folio);
+	if (ifs) {
+		if (atomic_dec_and_test(&ifs->write_bytes_pending))
+			folio_end_writeback(folio);
+	} else {
+		if (!count)
+			folio_end_writeback(folio);
 	}
-
-	/*
-	 * We can end up here with no error and nothing to write only if we race
-	 * with a partial page truncate on a sub-page block sized filesystem.
-	 */
-	if (!count)
-		folio_end_writeback(folio);
-done:
 	mapping_set_error(inode->i_mapping, error);
 	return error;
 }
@@ -1941,9 +1936,7 @@  iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 
 	wpc->ops = ops;
 	ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
-	if (!wpc->ioend)
-		return ret;
-	return iomap_submit_ioend(wpc, wpc->ioend, ret);
+	return iomap_submit_ioend(wpc, ret);
 }
 EXPORT_SYMBOL_GPL(iomap_writepages);
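
As an aside for readers following the error path: the patch fails an ioend by
completing its bio with the error status instead of submitting it.  Below is a
stripped-down user-space model of that decision; struct ioend, cached_ioend and
the function names are simplified stand-ins (for wpc->ioend, bio->bi_status and
the bio end_io handler), not kernel APIs.

#include <stdio.h>

struct ioend {
	int status;			/* stands in for bio->bi_status */
};

static struct ioend *cached_ioend;	/* stands in for wpc->ioend */

/* Stand-in for the bio end_io handler: in the kernel this is what clears
 * the folio writeback state and lets the filesystem record the error. */
static void ioend_end_io(struct ioend *io)
{
	printf("completion handler ran, status=%d\n", io->status);
}

static int submit_cached_ioend(int error)
{
	if (!cached_ioend)
		return error;
	if (error) {
		/* Pages are already marked for writeback, so the ioend cannot
		 * simply be dropped; fail it through the completion path. */
		cached_ioend->status = error;
		ioend_end_io(cached_ioend);
	} else {
		printf("submit_bio()\n");	/* real code hands the bio to the block layer */
	}
	cached_ioend = NULL;
	return error;
}

int main(void)
{
	struct ioend a = { 0 }, b = { 0 };

	cached_ioend = &a;
	submit_cached_ioend(0);		/* success: submit */

	cached_ioend = &b;
	submit_cached_ioend(-5);	/* failure (-EIO): completion runs with the error */
	return 0;
}

Running it prints the submit line for the success case and the completion line
with status=-5 for the failure case, mirroring how the patch uses bio_endio()
to drive the normal completion handler when submission fails partway through.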