diff mbox series

[4/4] iomap: cleanup iomap_write_iter()

Message ID 20240311122255.2637311-5-yi.zhang@huaweicloud.com (mailing list archive)
State New
Headers show
Series xfs/iomap: fix non-atomic clone operation and don't update size when zeroing range post eof | expand

Commit Message

Zhang Yi March 11, 2024, 12:22 p.m. UTC
From: Zhang Yi <yi.zhang@huawei.com>

The status variable in iomap_write_iter() is confusing and
iomap_write_end() always return 0 or copied bytes, so replace it with a
new written variable to represent the written bytes in each cycle, and
also do some cleanup, no logic changes.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/iomap/buffered-io.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

Comments

Darrick J. Wong March 11, 2024, 4:07 p.m. UTC | #1
On Mon, Mar 11, 2024 at 08:22:55PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> The status variable in iomap_write_iter() is confusing and
> iomap_write_end() always return 0 or copied bytes, so replace it with a
> new written variable to represent the written bytes in each cycle, and
> also do some cleanup, no logic changes.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/iomap/buffered-io.c | 31 +++++++++++++++----------------
>  1 file changed, 15 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 19f91324c690..767af6e67ed4 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -851,7 +851,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  	loff_t length = iomap_length(iter);
>  	size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
>  	loff_t pos = iter->pos;
> -	ssize_t written = 0;
> +	ssize_t total_written = 0;
>  	long status = 0;
>  	struct address_space *mapping = iter->inode->i_mapping;
>  	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
> @@ -862,6 +862,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  		size_t offset;		/* Offset into folio */
>  		size_t bytes;		/* Bytes to write to folio */
>  		size_t copied;		/* Bytes copied from user */
> +		size_t written;		/* Bytes have been written */
>  
>  		bytes = iov_iter_count(i);
>  retry:
> @@ -906,7 +907,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  			flush_dcache_folio(folio);
>  
>  		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> -		status = iomap_write_end(iter, pos, bytes, copied, folio);
> +		written = iomap_write_end(iter, pos, bytes, copied, folio);
>  
>  		/*
>  		 * Update the in-memory inode size after copying the data into
> @@ -915,28 +916,26 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  		 * no stale data is exposed.
>  		 */
>  		old_size = iter->inode->i_size;
> -		if (pos + status > old_size) {
> -			i_size_write(iter->inode, pos + status);
> +		if (pos + written > old_size) {
> +			i_size_write(iter->inode, pos + written);
>  			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
>  		}
> -		__iomap_put_folio(iter, pos, status, folio);
> +		__iomap_put_folio(iter, pos, written, folio);
>  
>  		if (old_size < pos)
>  			pagecache_isize_extended(iter->inode, old_size, pos);
> -		if (status < bytes)
> -			iomap_write_failed(iter->inode, pos + status,
> -					   bytes - status);
> -		if (unlikely(copied != status))
> -			iov_iter_revert(i, copied - status);

I wish you'd made the variable renaming and the function reorganization
separate patches.  The renaming looks correct to me, but moving these
calls adds a logic bomb.

If at some point iomap_write_end actually starts returning partial write
completions (e.g. you wrote 250 bytes, but for some reason the pagecache
only acknowledges 100 bytes were written) then this code no longer
reverts the iter or truncates posteof pagecache correctly...

>  
>  		cond_resched();
> -		if (unlikely(status == 0)) {
> +		if (unlikely(written == 0)) {
>  			/*
>  			 * A short copy made iomap_write_end() reject the
>  			 * thing entirely.  Might be memory poisoning
>  			 * halfway through, might be a race with munmap,
>  			 * might be severe memory pressure.
>  			 */
> +			iomap_write_failed(iter->inode, pos, bytes);
> +			iov_iter_revert(i, copied);

...because now we only do that if the pagecache refuses to acknowledge
any bytes written at all.  I think it actually works correctly with
today's kernel since __iomap_write_end only returns copied or 0, but the
size_t return type implies that a short acknowledgement is theoretically
possible.

IOWs, doesn't this adds a logic bomb?

--D

> +
>  			if (chunk > PAGE_SIZE)
>  				chunk /= 2;
>  			if (copied) {
> @@ -944,17 +943,17 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  				goto retry;
>  			}
>  		} else {
> -			pos += status;
> -			written += status;
> -			length -= status;
> +			pos += written;
> +			total_written += written;
> +			length -= written;
>  		}
>  	} while (iov_iter_count(i) && length);
>  
>  	if (status == -EAGAIN) {
> -		iov_iter_revert(i, written);
> +		iov_iter_revert(i, total_written);
>  		return -EAGAIN;
>  	}
> -	return written ? written : status;
> +	return total_written ? total_written : status;
>  }
>  
>  ssize_t
> -- 
> 2.39.2
> 
>
Christoph Hellwig March 12, 2024, 12:24 p.m. UTC | #2
On Mon, Mar 11, 2024 at 09:07:39AM -0700, Darrick J. Wong wrote:
> If at some point iomap_write_end actually starts returning partial write
> completions (e.g. you wrote 250 bytes, but for some reason the pagecache
> only acknowledges 100 bytes were written) then this code no longer
> reverts the iter or truncates posteof pagecache correctly...

I don't think it makes sense to return a partial write from
iomap_write_end.  But to make that clear it really should not return
a byte count by a boolean.  I've been wanting to make that cleanup
for a while, but it would reach all the way into buffer.c.
Darrick J. Wong March 12, 2024, 4:27 p.m. UTC | #3
On Tue, Mar 12, 2024 at 05:24:00AM -0700, Christoph Hellwig wrote:
> On Mon, Mar 11, 2024 at 09:07:39AM -0700, Darrick J. Wong wrote:
> > If at some point iomap_write_end actually starts returning partial write
> > completions (e.g. you wrote 250 bytes, but for some reason the pagecache
> > only acknowledges 100 bytes were written) then this code no longer
> > reverts the iter or truncates posteof pagecache correctly...
> 
> I don't think it makes sense to return a partial write from
> iomap_write_end.  But to make that clear it really should not return
> a byte count by a boolean.  I've been wanting to make that cleanup
> for a while, but it would reach all the way into buffer.c.

For now, can we change the return types of iomap_write_end_inline and
__iomap_write_end?  Then iomap can WARN_ON if the block_write_end return
value isn't 0 or copied:

	bool ret;

	if (srcmap->type == IOMAP_INLINE) {
		ret = iomap_write_end_inline(iter, folio, pos, copied);
	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
		size_t bh_written;

		bh_written = block_write_end(NULL, iter->inode->i_mapping,
				pos, len, copied, &folio->page, NULL);

		WARN_ON(bh_written != copied && bh_written != 0);
		ret = bh_written == copied;
	} else {
		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
	}

	...

	return ret;

Some day later we can circle back to bufferheads, or maybe they'll die
before we get to it. ;)

--D
Zhang Yi March 13, 2024, 9:23 a.m. UTC | #4
On 2024/3/13 0:27, Darrick J. Wong wrote:
> On Tue, Mar 12, 2024 at 05:24:00AM -0700, Christoph Hellwig wrote:
>> On Mon, Mar 11, 2024 at 09:07:39AM -0700, Darrick J. Wong wrote:
>>> If at some point iomap_write_end actually starts returning partial write
>>> completions (e.g. you wrote 250 bytes, but for some reason the pagecache
>>> only acknowledges 100 bytes were written) then this code no longer
>>> reverts the iter or truncates posteof pagecache correctly...
>>
>> I don't think it makes sense to return a partial write from
>> iomap_write_end.  But to make that clear it really should not return
>> a byte count by a boolean.  I've been wanting to make that cleanup
>> for a while, but it would reach all the way into buffer.c.
> 
> For now, can we change the return types of iomap_write_end_inline and
> __iomap_write_end?  Then iomap can WARN_ON if the block_write_end return
> value isn't 0 or copied:
> 
> 	bool ret;
> 
> 	if (srcmap->type == IOMAP_INLINE) {
> 		ret = iomap_write_end_inline(iter, folio, pos, copied);
> 	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
> 		size_t bh_written;
> 
> 		bh_written = block_write_end(NULL, iter->inode->i_mapping,
> 				pos, len, copied, &folio->page, NULL);
> 
> 		WARN_ON(bh_written != copied && bh_written != 0);
> 		ret = bh_written == copied;
> 	} else {
> 		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
> 	}
> 
> 	...
> 
> 	return ret;
> 
> Some day later we can circle back to bufferheads, or maybe they'll die
> before we get to it. ;)
> 

It looks great to me for now, we can revise iomap first.

Thanks,
Yi.
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 19f91324c690..767af6e67ed4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -851,7 +851,7 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 	loff_t length = iomap_length(iter);
 	size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
 	loff_t pos = iter->pos;
-	ssize_t written = 0;
+	ssize_t total_written = 0;
 	long status = 0;
 	struct address_space *mapping = iter->inode->i_mapping;
 	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
@@ -862,6 +862,7 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		size_t offset;		/* Offset into folio */
 		size_t bytes;		/* Bytes to write to folio */
 		size_t copied;		/* Bytes copied from user */
+		size_t written;		/* Bytes have been written */
 
 		bytes = iov_iter_count(i);
 retry:
@@ -906,7 +907,7 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 			flush_dcache_folio(folio);
 
 		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
-		status = iomap_write_end(iter, pos, bytes, copied, folio);
+		written = iomap_write_end(iter, pos, bytes, copied, folio);
 
 		/*
 		 * Update the in-memory inode size after copying the data into
@@ -915,28 +916,26 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		 * no stale data is exposed.
 		 */
 		old_size = iter->inode->i_size;
-		if (pos + status > old_size) {
-			i_size_write(iter->inode, pos + status);
+		if (pos + written > old_size) {
+			i_size_write(iter->inode, pos + written);
 			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
 		}
-		__iomap_put_folio(iter, pos, status, folio);
+		__iomap_put_folio(iter, pos, written, folio);
 
 		if (old_size < pos)
 			pagecache_isize_extended(iter->inode, old_size, pos);
-		if (status < bytes)
-			iomap_write_failed(iter->inode, pos + status,
-					   bytes - status);
-		if (unlikely(copied != status))
-			iov_iter_revert(i, copied - status);
 
 		cond_resched();
-		if (unlikely(status == 0)) {
+		if (unlikely(written == 0)) {
 			/*
 			 * A short copy made iomap_write_end() reject the
 			 * thing entirely.  Might be memory poisoning
 			 * halfway through, might be a race with munmap,
 			 * might be severe memory pressure.
 			 */
+			iomap_write_failed(iter->inode, pos, bytes);
+			iov_iter_revert(i, copied);
+
 			if (chunk > PAGE_SIZE)
 				chunk /= 2;
 			if (copied) {
@@ -944,17 +943,17 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 				goto retry;
 			}
 		} else {
-			pos += status;
-			written += status;
-			length -= status;
+			pos += written;
+			total_written += written;
+			length -= written;
 		}
 	} while (iov_iter_count(i) && length);
 
 	if (status == -EAGAIN) {
-		iov_iter_revert(i, written);
+		iov_iter_revert(i, total_written);
 		return -EAGAIN;
 	}
-	return written ? written : status;
+	return total_written ? total_written : status;
 }
 
 ssize_t