diff mbox series

[v4,01/10] ext4: remove writable userspace mappings before truncating page cache

Message ID 20241216013915.3392419-2-yi.zhang@huaweicloud.com (mailing list archive)
State New
Headers show
Series ext4: clean up and refactor fallocate | expand

Commit Message

Zhang Yi Dec. 16, 2024, 1:39 a.m. UTC
From: Zhang Yi <yi.zhang@huawei.com>

When zeroing a range of folios on the filesystem which block size is
less than the page size, the file's mapped blocks within one page will
be marked as unwritten, we should remove writable userspace mappings to
ensure that ext4_page_mkwrite() can be called during subsequent write
access to these partial folios. Otherwise, data written by subsequent
mmap writes may not be saved to disk.

 $mkfs.ext4 -b 1024 /dev/vdb
 $mount /dev/vdb /mnt
 $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
               -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
               -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo

 $od -Ax -t x1z /mnt/foo
 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
 *
 000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
 *
 001000

 $umount /mnt && mount /dev/vdb /mnt
 $od -Ax -t x1z /mnt/foo
 000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
 *
 000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 *
 001000

Fix this by introducing ext4_truncate_page_cache_block_range() to remove
writable userspace mappings when truncating a partial folio range.
Additionally, move the journal data mode-specific handlers and
truncate_pagecache_range() into this function, allowing it to serve as a
common helper that correctly manages the page cache in preparation for
block range manipulations.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h    |  2 ++
 fs/ext4/extents.c | 19 ++++-----------
 fs/ext4/inode.c   | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 14 deletions(-)

Comments

Jan Kara Dec. 16, 2024, 3 p.m. UTC | #1
On Mon 16-12-24 09:39:06, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> When zeroing a range of folios on the filesystem which block size is
> less than the page size, the file's mapped blocks within one page will
> be marked as unwritten, we should remove writable userspace mappings to
> ensure that ext4_page_mkwrite() can be called during subsequent write
> access to these partial folios. Otherwise, data written by subsequent
> mmap writes may not be saved to disk.
> 
>  $mkfs.ext4 -b 1024 /dev/vdb
>  $mount /dev/vdb /mnt
>  $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
>                -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
>                -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
> 
>  $od -Ax -t x1z /mnt/foo
>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>  *
>  000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
>  *
>  001000
> 
>  $umount /mnt && mount /dev/vdb /mnt
>  $od -Ax -t x1z /mnt/foo
>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>  *
>  000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>  *
>  001000
> 
> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
> writable userspace mappings when truncating a partial folio range.
> Additionally, move the journal data mode-specific handlers and
> truncate_pagecache_range() into this function, allowing it to serve as a
> common helper that correctly manages the page cache in preparation for
> block range manipulations.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

I like the patch. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

Just one thing occured to me when thinking about this: It seems like a
nasty catch that truncate_inode_pages_range() does not writeprotect these
partial pages because practically any filesystem supporting blocksize <
pagesize and doing anything non-trivial in ->page_mkwrite handler will need
this. So ultimately I think we might want to fix this in generic code but
ext4 solution is fine for now.

								Honza
> ---
>  fs/ext4/ext4.h    |  2 ++
>  fs/ext4/extents.c | 19 ++++-----------
>  fs/ext4/inode.c   | 62 +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 69 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 74f2071189b2..8843929b46ce 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3016,6 +3016,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
>  extern int ext4_can_truncate(struct inode *inode);
>  extern int ext4_truncate(struct inode *);
>  extern int ext4_break_layouts(struct inode *);
> +extern int ext4_truncate_page_cache_block_range(struct inode *inode,
> +						loff_t start, loff_t end);
>  extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
>  extern void ext4_set_inode_flags(struct inode *, bool init);
>  extern int ext4_alloc_da_blocks(struct inode *inode);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index a07a98a4b97a..8dc6b4271b15 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>  			goto out_mutex;
>  		}
>  
> -		/*
> -		 * For journalled data we need to write (and checkpoint) pages
> -		 * before discarding page cache to avoid inconsitent data on
> -		 * disk in case of crash before zeroing trans is committed.
> -		 */
> -		if (ext4_should_journal_data(inode)) {
> -			ret = filemap_write_and_wait_range(mapping, start,
> -							   end - 1);
> -			if (ret) {
> -				filemap_invalidate_unlock(mapping);
> -				goto out_mutex;
> -			}
> +		/* Now release the pages and zero block aligned part of pages */
> +		ret = ext4_truncate_page_cache_block_range(inode, start, end);
> +		if (ret) {
> +			filemap_invalidate_unlock(mapping);
> +			goto out_mutex;
>  		}
>  
> -		/* Now release the pages and zero block aligned part of pages */
> -		truncate_pagecache_range(inode, start, end - 1);
>  		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
>  
>  		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 89aade6f45f6..c68a8b841148 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -31,6 +31,7 @@
>  #include <linux/writeback.h>
>  #include <linux/pagevec.h>
>  #include <linux/mpage.h>
> +#include <linux/rmap.h>
>  #include <linux/namei.h>
>  #include <linux/uio.h>
>  #include <linux/bio.h>
> @@ -3902,6 +3903,67 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
>  	return ret;
>  }
>  
> +static inline void ext4_truncate_folio(struct inode *inode,
> +				       loff_t start, loff_t end)
> +{
> +	unsigned long blocksize = i_blocksize(inode);
> +	struct folio *folio;
> +
> +	/* Nothing to be done if no complete block needs to be truncated. */
> +	if (round_up(start, blocksize) >= round_down(end, blocksize))
> +		return;
> +
> +	folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
> +	if (IS_ERR(folio))
> +		return;
> +
> +	if (folio_mkclean(folio))
> +		folio_mark_dirty(folio);
> +	folio_unlock(folio);
> +	folio_put(folio);
> +}
> +
> +int ext4_truncate_page_cache_block_range(struct inode *inode,
> +					 loff_t start, loff_t end)
> +{
> +	unsigned long blocksize = i_blocksize(inode);
> +	int ret;
> +
> +	/*
> +	 * For journalled data we need to write (and checkpoint) pages
> +	 * before discarding page cache to avoid inconsitent data on disk
> +	 * in case of crash before freeing or unwritten converting trans
> +	 * is committed.
> +	 */
> +	if (ext4_should_journal_data(inode)) {
> +		ret = filemap_write_and_wait_range(inode->i_mapping, start,
> +						   end - 1);
> +		if (ret)
> +			return ret;
> +		goto truncate_pagecache;
> +	}
> +
> +	/*
> +	 * If the block size is less than the page size, the file's mapped
> +	 * blocks within one page could be freed or converted to unwritten.
> +	 * So it's necessary to remove writable userspace mappings, and then
> +	 * ext4_page_mkwrite() can be called during subsequent write access
> +	 * to these partial folios.
> +	 */
> +	if (blocksize < PAGE_SIZE && start < inode->i_size) {
> +		loff_t start_boundary = round_up(start, PAGE_SIZE);
> +
> +		ext4_truncate_folio(inode, start, min(start_boundary, end));
> +		if (end > start_boundary)
> +			ext4_truncate_folio(inode,
> +					    round_down(end, PAGE_SIZE), end);
> +	}
> +
> +truncate_pagecache:
> +	truncate_pagecache_range(inode, start, end - 1);
> +	return 0;
> +}
> +
>  static void ext4_wait_dax_page(struct inode *inode)
>  {
>  	filemap_invalidate_unlock(inode->i_mapping);
> -- 
> 2.46.1
>
Matthew Wilcox (Oracle) Dec. 16, 2024, 3:15 p.m. UTC | #2
On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
>  $mkfs.ext4 -b 1024 /dev/vdb
>  $mount /dev/vdb /mnt
>  $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
>                -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
>                -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
> 
>  $od -Ax -t x1z /mnt/foo
>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>  *
>  000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
>  *
>  001000
> 
>  $umount /mnt && mount /dev/vdb /mnt
>  $od -Ax -t x1z /mnt/foo
>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>  *
>  000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>  *
>  001000

Can you add this to fstests please so we can be sure other filesystems
don't have the same problem?
Zhang Yi Dec. 17, 2024, 7:05 a.m. UTC | #3
On 2024/12/16 23:00, Jan Kara wrote:
> On Mon 16-12-24 09:39:06, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> When zeroing a range of folios on the filesystem which block size is
>> less than the page size, the file's mapped blocks within one page will
>> be marked as unwritten, we should remove writable userspace mappings to
>> ensure that ext4_page_mkwrite() can be called during subsequent write
>> access to these partial folios. Otherwise, data written by subsequent
>> mmap writes may not be saved to disk.
>>
>>  $mkfs.ext4 -b 1024 /dev/vdb
>>  $mount /dev/vdb /mnt
>>  $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
>>                -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
>>                -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
>>
>>  $od -Ax -t x1z /mnt/foo
>>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>>  *
>>  000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
>>  *
>>  001000
>>
>>  $umount /mnt && mount /dev/vdb /mnt
>>  $od -Ax -t x1z /mnt/foo
>>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>>  *
>>  000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>  *
>>  001000
>>
>> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
>> writable userspace mappings when truncating a partial folio range.
>> Additionally, move the journal data mode-specific handlers and
>> truncate_pagecache_range() into this function, allowing it to serve as a
>> common helper that correctly manages the page cache in preparation for
>> block range manipulations.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> 
> I like the patch. Feel free to add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>
> 
> Just one thing occured to me when thinking about this: It seems like a
> nasty catch that truncate_inode_pages_range() does not writeprotect these
> partial pages because practically any filesystem supporting blocksize <
> pagesize and doing anything non-trivial in ->page_mkwrite handler will need
> this. So ultimately I think we might want to fix this in generic code but
> ext4 solution is fine for now.
> 

Yes, I agree with you. I've checked XFS, Btrfs, and Bcachefs. Currently,
XFS and Btrfs choose to perform writeback before truncating partial
folios, so they do not require this at the moment. However, it appears
that Bcachefs also does a similar approach in __bch2_truncate_folio().
So I think do this in generic code should be better too.

Thanks,
Yi.
Zhang Yi Dec. 17, 2024, 7:38 a.m. UTC | #4
On 2024/12/16 23:15, Matthew Wilcox wrote:
> On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
>>  $mkfs.ext4 -b 1024 /dev/vdb
>>  $mount /dev/vdb /mnt
>>  $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
>>                -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
>>                -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
>>
>>  $od -Ax -t x1z /mnt/foo
>>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>>  *
>>  000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
>>  *
>>  001000
>>
>>  $umount /mnt && mount /dev/vdb /mnt
>>  $od -Ax -t x1z /mnt/foo
>>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>>  *
>>  000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>  *
>>  001000
> 
> Can you add this to fstests please so we can be sure other filesystems
> don't have the same problem?

Sure, I captured this issue by generic/567 while refactoring punch
hole operation on ext4. The generic/567 only performs a partial punch
hole test but does not include a partial zero range test, so we
did not capture this issue. I will expand this test and add this case.

Thanks,
Yi.
Ojaswin Mujoo Dec. 18, 2024, 9:56 a.m. UTC | #5
On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> When zeroing a range of folios on the filesystem which block size is
> less than the page size, the file's mapped blocks within one page will
> be marked as unwritten, we should remove writable userspace mappings to
> ensure that ext4_page_mkwrite() can be called during subsequent write
> access to these partial folios. Otherwise, data written by subsequent
> mmap writes may not be saved to disk.
> 
>  $mkfs.ext4 -b 1024 /dev/vdb
>  $mount /dev/vdb /mnt
>  $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
>                -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
>                -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
> 
>  $od -Ax -t x1z /mnt/foo
>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>  *
>  000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
>  *
>  001000
> 
>  $umount /mnt && mount /dev/vdb /mnt
>  $od -Ax -t x1z /mnt/foo
>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>  *
>  000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>  *
>  001000
> 
> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
> writable userspace mappings when truncating a partial folio range.
> Additionally, move the journal data mode-specific handlers and
> truncate_pagecache_range() into this function, allowing it to serve as a
> common helper that correctly manages the page cache in preparation for
> block range manipulations.

Hi Zhang,

Thanks for the fix, just to confirm my understanding, the issue arises
because of the following flow:

1. page_mkwrite() makes folio dirty when we write to the mmap'd region

2. ext4_zero_range (2kb to 4kb)
    truncate_pagecache_range
      truncate_inode_pages_range
        truncate_inode_partial_folio
          folio_zero_range (2kb to 4kb)
            folio_invalidate
              ext4_invalidate_folio
                block_invalidate_folio -> clear the bh dirty bit

3. mwrite (2kb to 4kb): Again we write in pagecache but the bh is not
   dirty hence after a remount the data is not seen on disk

Also, we won't see this issue if we are zeroing a page aligned range
since we end up unmapping the pages from the proccess address space in 
that case. Correct?

I have also tested the patch in PowerPC with 64k pagesize and 4k blocks
size and can confirm that it fixes the data loss issue. That being said,
I have a few minor comments on the patch below:

> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/ext4/ext4.h    |  2 ++
>  fs/ext4/extents.c | 19 ++++-----------
>  fs/ext4/inode.c   | 62 +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 69 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 74f2071189b2..8843929b46ce 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3016,6 +3016,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
>  extern int ext4_can_truncate(struct inode *inode);
>  extern int ext4_truncate(struct inode *);
>  extern int ext4_break_layouts(struct inode *);
> +extern int ext4_truncate_page_cache_block_range(struct inode *inode,
> +						loff_t start, loff_t end);
>  extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
>  extern void ext4_set_inode_flags(struct inode *, bool init);
>  extern int ext4_alloc_da_blocks(struct inode *inode);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index a07a98a4b97a..8dc6b4271b15 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>  			goto out_mutex;
>  		}
>  
> -		/*
> -		 * For journalled data we need to write (and checkpoint) pages
> -		 * before discarding page cache to avoid inconsitent data on
> -		 * disk in case of crash before zeroing trans is committed.
> -		 */
> -		if (ext4_should_journal_data(inode)) {
> -			ret = filemap_write_and_wait_range(mapping, start,
> -							   end - 1);
> -			if (ret) {
> -				filemap_invalidate_unlock(mapping);
> -				goto out_mutex;
> -			}
> +		/* Now release the pages and zero block aligned part of pages */
> +		ret = ext4_truncate_page_cache_block_range(inode, start, end);
> +		if (ret) {
> +			filemap_invalidate_unlock(mapping);
> +			goto out_mutex;
>  		}
>  
> -		/* Now release the pages and zero block aligned part of pages */
> -		truncate_pagecache_range(inode, start, end - 1);
>  		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
>  
>  		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 89aade6f45f6..c68a8b841148 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -31,6 +31,7 @@
>  #include <linux/writeback.h>
>  #include <linux/pagevec.h>
>  #include <linux/mpage.h>
> +#include <linux/rmap.h>
>  #include <linux/namei.h>
>  #include <linux/uio.h>
>  #include <linux/bio.h>
> @@ -3902,6 +3903,67 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
>  	return ret;
>  }
>  
> +static inline void ext4_truncate_folio(struct inode *inode,
> +				       loff_t start, loff_t end)
> +{
> +	unsigned long blocksize = i_blocksize(inode);
> +	struct folio *folio;
> +
> +	/* Nothing to be done if no complete block needs to be truncated. */
> +	if (round_up(start, blocksize) >= round_down(end, blocksize))
> +		return;
> +
> +	folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
> +	if (IS_ERR(folio))
> +		return;
> +
> +	if (folio_mkclean(folio))
> +		folio_mark_dirty(folio);
> +	folio_unlock(folio);
> +	folio_put(folio);
> +}
> +
> +int ext4_truncate_page_cache_block_range(struct inode *inode,
> +					 loff_t start, loff_t end)
> +{
> +	unsigned long blocksize = i_blocksize(inode);
> +	int ret;
> +
> +	/*
> +	 * For journalled data we need to write (and checkpoint) pages
> +	 * before discarding page cache to avoid inconsitent data on disk
> +	 * in case of crash before freeing or unwritten converting trans
> +	 * is committed.
> +	 */
> +	if (ext4_should_journal_data(inode)) {
> +		ret = filemap_write_and_wait_range(inode->i_mapping, start,
> +						   end - 1);
> +		if (ret)
> +			return ret;
> +		goto truncate_pagecache;
> +	}
> +
> +	/*
> +	 * If the block size is less than the page size, the file's mapped
> +	 * blocks within one page could be freed or converted to unwritten.
> +	 * So it's necessary to remove writable userspace mappings, and then
> +	 * ext4_page_mkwrite() can be called during subsequent write access
> +	 * to these partial folios.
> +	 */
> +	if (blocksize < PAGE_SIZE && start < inode->i_size) {

Maybe we should only call ext4_truncate_folio() if the range is not page
aligned, rather than calling it everytime for bs < ps?

> +		loff_t start_boundary = round_up(start, PAGE_SIZE);

I think page_boundary seems like a more suitable name for the variable.

Regards,
ojaswin

> +
> +		ext4_truncate_folio(inode, start, min(start_boundary, end));
> +		if (end > start_boundary)
> +			ext4_truncate_folio(inode,
> +					    round_down(end, PAGE_SIZE), end);
> +	}
> +
> +truncate_pagecache:
> +	truncate_pagecache_range(inode, start, end - 1);
> +	return 0;
> +}
> +
>  static void ext4_wait_dax_page(struct inode *inode)
>  {
>  	filemap_invalidate_unlock(inode->i_mapping);
> -- 
> 2.46.1
>
Zhang Yi Dec. 18, 2024, 1:02 p.m. UTC | #6
On 2024/12/18 17:56, Ojaswin Mujoo wrote:
> On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> When zeroing a range of folios on the filesystem which block size is
>> less than the page size, the file's mapped blocks within one page will
>> be marked as unwritten, we should remove writable userspace mappings to
>> ensure that ext4_page_mkwrite() can be called during subsequent write
>> access to these partial folios. Otherwise, data written by subsequent
>> mmap writes may not be saved to disk.
>>
>>  $mkfs.ext4 -b 1024 /dev/vdb
>>  $mount /dev/vdb /mnt
>>  $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
>>                -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
>>                -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
>>
>>  $od -Ax -t x1z /mnt/foo
>>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>>  *
>>  000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
>>  *
>>  001000
>>
>>  $umount /mnt && mount /dev/vdb /mnt
>>  $od -Ax -t x1z /mnt/foo
>>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
>>  *
>>  000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>  *
>>  001000
>>
>> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
>> writable userspace mappings when truncating a partial folio range.
>> Additionally, move the journal data mode-specific handlers and
>> truncate_pagecache_range() into this function, allowing it to serve as a
>> common helper that correctly manages the page cache in preparation for
>> block range manipulations.
> 
> Hi Zhang,
> 
> Thanks for the fix, just to confirm my understanding, the issue arises
> because of the following flow:
> 
> 1. page_mkwrite() makes folio dirty when we write to the mmap'd region
> 
> 2. ext4_zero_range (2kb to 4kb)
>     truncate_pagecache_range
>       truncate_inode_pages_range
>         truncate_inode_partial_folio
>           folio_zero_range (2kb to 4kb)
>             folio_invalidate
>               ext4_invalidate_folio
>                 block_invalidate_folio -> clear the bh dirty bit
> 
> 3. mwrite (2kb to 4kb): Again we write in pagecache but the bh is not
>    dirty hence after a remount the data is not seen on disk
> 
> Also, we won't see this issue if we are zeroing a page aligned range
> since we end up unmapping the pages from the proccess address space in 
> that case. Correct?

Thank you for review! Yes, it's correct.

> 
> I have also tested the patch in PowerPC with 64k pagesize and 4k blocks
> size and can confirm that it fixes the data loss issue. That being said,
> I have a few minor comments on the patch below:
> 

Thank you for the test.

>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>>  fs/ext4/ext4.h    |  2 ++
>>  fs/ext4/extents.c | 19 ++++-----------
>>  fs/ext4/inode.c   | 62 +++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 69 insertions(+), 14 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 74f2071189b2..8843929b46ce 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -3016,6 +3016,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
>>  extern int ext4_can_truncate(struct inode *inode);
>>  extern int ext4_truncate(struct inode *);
>>  extern int ext4_break_layouts(struct inode *);
>> +extern int ext4_truncate_page_cache_block_range(struct inode *inode,
>> +						loff_t start, loff_t end);
>>  extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
>>  extern void ext4_set_inode_flags(struct inode *, bool init);
>>  extern int ext4_alloc_da_blocks(struct inode *inode);
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index a07a98a4b97a..8dc6b4271b15 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>>  			goto out_mutex;
>>  		}
>>  
>> -		/*
>> -		 * For journalled data we need to write (and checkpoint) pages
>> -		 * before discarding page cache to avoid inconsitent data on
>> -		 * disk in case of crash before zeroing trans is committed.
>> -		 */
>> -		if (ext4_should_journal_data(inode)) {
>> -			ret = filemap_write_and_wait_range(mapping, start,
>> -							   end - 1);
>> -			if (ret) {
>> -				filemap_invalidate_unlock(mapping);
>> -				goto out_mutex;
>> -			}
>> +		/* Now release the pages and zero block aligned part of pages */
>> +		ret = ext4_truncate_page_cache_block_range(inode, start, end);
>> +		if (ret) {
>> +			filemap_invalidate_unlock(mapping);
>> +			goto out_mutex;
>>  		}
>>  
>> -		/* Now release the pages and zero block aligned part of pages */
>> -		truncate_pagecache_range(inode, start, end - 1);
>>  		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
>>  
>>  		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 89aade6f45f6..c68a8b841148 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -31,6 +31,7 @@
>>  #include <linux/writeback.h>
>>  #include <linux/pagevec.h>
>>  #include <linux/mpage.h>
>> +#include <linux/rmap.h>
>>  #include <linux/namei.h>
>>  #include <linux/uio.h>
>>  #include <linux/bio.h>
>> @@ -3902,6 +3903,67 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
>>  	return ret;
>>  }
>>  
>> +static inline void ext4_truncate_folio(struct inode *inode,
>> +				       loff_t start, loff_t end)
>> +{
>> +	unsigned long blocksize = i_blocksize(inode);
>> +	struct folio *folio;
>> +
>> +	/* Nothing to be done if no complete block needs to be truncated. */
>> +	if (round_up(start, blocksize) >= round_down(end, blocksize))
>> +		return;
>> +
>> +	folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
>> +	if (IS_ERR(folio))
>> +		return;
>> +
>> +	if (folio_mkclean(folio))
>> +		folio_mark_dirty(folio);
>> +	folio_unlock(folio);
>> +	folio_put(folio);
>> +}
>> +
>> +int ext4_truncate_page_cache_block_range(struct inode *inode,
>> +					 loff_t start, loff_t end)
>> +{
>> +	unsigned long blocksize = i_blocksize(inode);
>> +	int ret;
>> +
>> +	/*
>> +	 * For journalled data we need to write (and checkpoint) pages
>> +	 * before discarding page cache to avoid inconsitent data on disk
>> +	 * in case of crash before freeing or unwritten converting trans
>> +	 * is committed.
>> +	 */
>> +	if (ext4_should_journal_data(inode)) {
>> +		ret = filemap_write_and_wait_range(inode->i_mapping, start,
>> +						   end - 1);
>> +		if (ret)
>> +			return ret;
>> +		goto truncate_pagecache;
>> +	}
>> +
>> +	/*
>> +	 * If the block size is less than the page size, the file's mapped
>> +	 * blocks within one page could be freed or converted to unwritten.
>> +	 * So it's necessary to remove writable userspace mappings, and then
>> +	 * ext4_page_mkwrite() can be called during subsequent write access
>> +	 * to these partial folios.
>> +	 */
>> +	if (blocksize < PAGE_SIZE && start < inode->i_size) {
> 
> Maybe we should only call ext4_truncate_folio() if the range is not page
> aligned, rather than calling it everytime for bs < ps?

I agree with you, so how about below?

	if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
	    blocksize < PAGE_SIZE && start < inode->i_size && )

> 
>> +		loff_t start_boundary = round_up(start, PAGE_SIZE);
> 
> I think page_boundary seems like a more suitable name for the variable.

Yeah, it looks fine to me.

Thanks,
Yi.
Ojaswin Mujoo Dec. 19, 2024, 7:19 a.m. UTC | #7
On Wed, Dec 18, 2024 at 09:02:18PM +0800, Zhang Yi wrote:
> On 2024/12/18 17:56, Ojaswin Mujoo wrote:
> > On Mon, Dec 16, 2024 at 09:39:06AM +0800, Zhang Yi wrote:
> >> From: Zhang Yi <yi.zhang@huawei.com>
> >>
> >> When zeroing a range of folios on the filesystem which block size is
> >> less than the page size, the file's mapped blocks within one page will
> >> be marked as unwritten, we should remove writable userspace mappings to
> >> ensure that ext4_page_mkwrite() can be called during subsequent write
> >> access to these partial folios. Otherwise, data written by subsequent
> >> mmap writes may not be saved to disk.
> >>
> >>  $mkfs.ext4 -b 1024 /dev/vdb
> >>  $mount /dev/vdb /mnt
> >>  $xfs_io -t -f -c "pwrite -S 0x58 0 4096" -c "mmap -rw 0 4096" \
> >>                -c "mwrite -S 0x5a 2048 2048" -c "fzero 2048 2048" \
> >>                -c "mwrite -S 0x59 2048 2048" -c "close" /mnt/foo
> >>
> >>  $od -Ax -t x1z /mnt/foo
> >>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> >>  *
> >>  000800 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
> >>  *
> >>  001000
> >>
> >>  $umount /mnt && mount /dev/vdb /mnt
> >>  $od -Ax -t x1z /mnt/foo
> >>  000000 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58
> >>  *
> >>  000800 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>  *
> >>  001000
> >>
> >> Fix this by introducing ext4_truncate_page_cache_block_range() to remove
> >> writable userspace mappings when truncating a partial folio range.
> >> Additionally, move the journal data mode-specific handlers and
> >> truncate_pagecache_range() into this function, allowing it to serve as a
> >> common helper that correctly manages the page cache in preparation for
> >> block range manipulations.
> > 
> > Hi Zhang,
> > 
> > Thanks for the fix, just to confirm my understanding, the issue arises
> > because of the following flow:
> > 
> > 1. page_mkwrite() makes folio dirty when we write to the mmap'd region
> > 
> > 2. ext4_zero_range (2kb to 4kb)
> >     truncate_pagecache_range
> >       truncate_inode_pages_range
> >         truncate_inode_partial_folio
> >           folio_zero_range (2kb to 4kb)
> >             folio_invalidate
> >               ext4_invalidate_folio
> >                 block_invalidate_folio -> clear the bh dirty bit
> > 
> > 3. mwrite (2kb to 4kb): Again we write in pagecache but the bh is not
> >    dirty hence after a remount the data is not seen on disk
> > 
> > Also, we won't see this issue if we are zeroing a page aligned range
> > since we end up unmapping the pages from the proccess address space in 
> > that case. Correct?
> 
> Thank you for review! Yes, it's correct.
> 
> > 
> > I have also tested the patch in PowerPC with 64k pagesize and 4k blocks
> > size and can confirm that it fixes the data loss issue. That being said,
> > I have a few minor comments on the patch below:
> > 
> 
> Thank you for the test.
> 
> >>
> >> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> >> ---
> >>  fs/ext4/ext4.h    |  2 ++
> >>  fs/ext4/extents.c | 19 ++++-----------
> >>  fs/ext4/inode.c   | 62 +++++++++++++++++++++++++++++++++++++++++++++++
> >>  3 files changed, 69 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> >> index 74f2071189b2..8843929b46ce 100644
> >> --- a/fs/ext4/ext4.h
> >> +++ b/fs/ext4/ext4.h
> >> @@ -3016,6 +3016,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
> >>  extern int ext4_can_truncate(struct inode *inode);
> >>  extern int ext4_truncate(struct inode *);
> >>  extern int ext4_break_layouts(struct inode *);
> >> +extern int ext4_truncate_page_cache_block_range(struct inode *inode,
> >> +						loff_t start, loff_t end);
> >>  extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
> >>  extern void ext4_set_inode_flags(struct inode *, bool init);
> >>  extern int ext4_alloc_da_blocks(struct inode *inode);
> >> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> >> index a07a98a4b97a..8dc6b4271b15 100644
> >> --- a/fs/ext4/extents.c
> >> +++ b/fs/ext4/extents.c
> >> @@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> >>  			goto out_mutex;
> >>  		}
> >>  
> >> -		/*
> >> -		 * For journalled data we need to write (and checkpoint) pages
> >> -		 * before discarding page cache to avoid inconsitent data on
> >> -		 * disk in case of crash before zeroing trans is committed.
> >> -		 */
> >> -		if (ext4_should_journal_data(inode)) {
> >> -			ret = filemap_write_and_wait_range(mapping, start,
> >> -							   end - 1);
> >> -			if (ret) {
> >> -				filemap_invalidate_unlock(mapping);
> >> -				goto out_mutex;
> >> -			}
> >> +		/* Now release the pages and zero block aligned part of pages */
> >> +		ret = ext4_truncate_page_cache_block_range(inode, start, end);
> >> +		if (ret) {
> >> +			filemap_invalidate_unlock(mapping);
> >> +			goto out_mutex;
> >>  		}
> >>  
> >> -		/* Now release the pages and zero block aligned part of pages */
> >> -		truncate_pagecache_range(inode, start, end - 1);
> >>  		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
> >>  
> >>  		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> index 89aade6f45f6..c68a8b841148 100644
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> >> @@ -31,6 +31,7 @@
> >>  #include <linux/writeback.h>
> >>  #include <linux/pagevec.h>
> >>  #include <linux/mpage.h>
> >> +#include <linux/rmap.h>
> >>  #include <linux/namei.h>
> >>  #include <linux/uio.h>
> >>  #include <linux/bio.h>
> >> @@ -3902,6 +3903,67 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
> >>  	return ret;
> >>  }
> >>  
> >> +static inline void ext4_truncate_folio(struct inode *inode,
> >> +				       loff_t start, loff_t end)
> >> +{
> >> +	unsigned long blocksize = i_blocksize(inode);
> >> +	struct folio *folio;
> >> +
> >> +	/* Nothing to be done if no complete block needs to be truncated. */
> >> +	if (round_up(start, blocksize) >= round_down(end, blocksize))
> >> +		return;
> >> +
> >> +	folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
> >> +	if (IS_ERR(folio))
> >> +		return;
> >> +
> >> +	if (folio_mkclean(folio))
> >> +		folio_mark_dirty(folio);
> >> +	folio_unlock(folio);
> >> +	folio_put(folio);
> >> +}
> >> +
> >> +int ext4_truncate_page_cache_block_range(struct inode *inode,
> >> +					 loff_t start, loff_t end)
> >> +{
> >> +	unsigned long blocksize = i_blocksize(inode);
> >> +	int ret;
> >> +
> >> +	/*
> >> +	 * For journalled data we need to write (and checkpoint) pages
> >> +	 * before discarding page cache to avoid inconsitent data on disk
> >> +	 * in case of crash before freeing or unwritten converting trans
> >> +	 * is committed.
> >> +	 */
> >> +	if (ext4_should_journal_data(inode)) {
> >> +		ret = filemap_write_and_wait_range(inode->i_mapping, start,
> >> +						   end - 1);
> >> +		if (ret)
> >> +			return ret;
> >> +		goto truncate_pagecache;
> >> +	}
> >> +
> >> +	/*
> >> +	 * If the block size is less than the page size, the file's mapped
> >> +	 * blocks within one page could be freed or converted to unwritten.
> >> +	 * So it's necessary to remove writable userspace mappings, and then
> >> +	 * ext4_page_mkwrite() can be called during subsequent write access
> >> +	 * to these partial folios.
> >> +	 */
> >> +	if (blocksize < PAGE_SIZE && start < inode->i_size) {
> > 
> > Maybe we should only call ext4_truncate_folio() if the range is not page
> > aligned, rather than calling it everytime for bs < ps?
> 
> I agree with you, so how about below?
> 
> 	if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
> 	    blocksize < PAGE_SIZE && start < inode->i_size && )

This looks good Zhang, with this change and the variable rename, feel free to add

Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Regards,
ojaswin
> 
> > 
> >> +		loff_t start_boundary = round_up(start, PAGE_SIZE);
> > 
> > I think page_boundary seems like a more suitable name for the variable.
> 
> Yeah, it looks fine to me.
> 
> Thanks,
> Yi.
>
diff mbox series

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 74f2071189b2..8843929b46ce 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3016,6 +3016,8 @@  extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
 extern int ext4_truncate(struct inode *);
 extern int ext4_break_layouts(struct inode *);
+extern int ext4_truncate_page_cache_block_range(struct inode *inode,
+						loff_t start, loff_t end);
 extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern void ext4_set_inode_flags(struct inode *, bool init);
 extern int ext4_alloc_da_blocks(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a07a98a4b97a..8dc6b4271b15 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4667,22 +4667,13 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 			goto out_mutex;
 		}
 
-		/*
-		 * For journalled data we need to write (and checkpoint) pages
-		 * before discarding page cache to avoid inconsitent data on
-		 * disk in case of crash before zeroing trans is committed.
-		 */
-		if (ext4_should_journal_data(inode)) {
-			ret = filemap_write_and_wait_range(mapping, start,
-							   end - 1);
-			if (ret) {
-				filemap_invalidate_unlock(mapping);
-				goto out_mutex;
-			}
+		/* Now release the pages and zero block aligned part of pages */
+		ret = ext4_truncate_page_cache_block_range(inode, start, end);
+		if (ret) {
+			filemap_invalidate_unlock(mapping);
+			goto out_mutex;
 		}
 
-		/* Now release the pages and zero block aligned part of pages */
-		truncate_pagecache_range(inode, start, end - 1);
 		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 
 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 89aade6f45f6..c68a8b841148 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -31,6 +31,7 @@ 
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
+#include <linux/rmap.h>
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
@@ -3902,6 +3903,67 @@  int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
 	return ret;
 }
 
+static inline void ext4_truncate_folio(struct inode *inode,
+				       loff_t start, loff_t end)
+{
+	unsigned long blocksize = i_blocksize(inode);
+	struct folio *folio;
+
+	/* Nothing to be done if no complete block needs to be truncated. */
+	if (round_up(start, blocksize) >= round_down(end, blocksize))
+		return;
+
+	folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
+	if (IS_ERR(folio))
+		return;
+
+	if (folio_mkclean(folio))
+		folio_mark_dirty(folio);
+	folio_unlock(folio);
+	folio_put(folio);
+}
+
+int ext4_truncate_page_cache_block_range(struct inode *inode,
+					 loff_t start, loff_t end)
+{
+	unsigned long blocksize = i_blocksize(inode);
+	int ret;
+
+	/*
+	 * For journalled data we need to write (and checkpoint) pages
+	 * before discarding page cache to avoid inconsitent data on disk
+	 * in case of crash before freeing or unwritten converting trans
+	 * is committed.
+	 */
+	if (ext4_should_journal_data(inode)) {
+		ret = filemap_write_and_wait_range(inode->i_mapping, start,
+						   end - 1);
+		if (ret)
+			return ret;
+		goto truncate_pagecache;
+	}
+
+	/*
+	 * If the block size is less than the page size, the file's mapped
+	 * blocks within one page could be freed or converted to unwritten.
+	 * So it's necessary to remove writable userspace mappings, and then
+	 * ext4_page_mkwrite() can be called during subsequent write access
+	 * to these partial folios.
+	 */
+	if (blocksize < PAGE_SIZE && start < inode->i_size) {
+		loff_t start_boundary = round_up(start, PAGE_SIZE);
+
+		ext4_truncate_folio(inode, start, min(start_boundary, end));
+		if (end > start_boundary)
+			ext4_truncate_folio(inode,
+					    round_down(end, PAGE_SIZE), end);
+	}
+
+truncate_pagecache:
+	truncate_pagecache_range(inode, start, end - 1);
+	return 0;
+}
+
 static void ext4_wait_dax_page(struct inode *inode)
 {
 	filemap_invalidate_unlock(inode->i_mapping);