[PATCHv11,8/8] iomap: Add per-block dirty state tracking to improve performance

Message ID bb0c58bf80dcdec96d7387bc439925fb14a5a496.1688188958.git.ritesh.list@gmail.com (mailing list archive)
State New, archived
Series iomap: Add support for per-block dirty state to improve write performance

Commit Message

Ritesh Harjani (IBM) July 1, 2023, 7:34 a.m. UTC
When the filesystem blocksize is less than the folio size (either with
mapping_large_folio_support() or with blocksize < pagesize) and the
folio is uptodate in the pagecache, even a single-byte write can cause
the entire folio to be written to disk during writeback. This happens
because we currently don't have a mechanism to track per-block dirty
state within struct iomap_folio_state; we only track per-block uptodate
state.

This patch implements support for tracking per-block dirty state in the
iomap_folio_state->state bitmap. This should help improve filesystem
write performance and reduce write amplification.
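
To illustrate the layout (a simplified sketch; the real helpers are in the
fs/iomap/buffered-io.c hunk below): the first nr_blocks bits of ifs->state
track per-block uptodate state and the next nr_blocks bits track per-block
dirty state, so a block's dirty bit is looked up past the uptodate bits:

/*
 * ifs->state layout with this patch (illustrative):
 *
 *   bits [0, nr_blocks)              per-block uptodate
 *   bits [nr_blocks, 2 * nr_blocks)  per-block dirty
 */
static inline bool ifs_block_is_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, int block)
{
	struct inode *inode = folio->mapping->host;
	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);

	/* the dirty bits start right after the uptodate bits */
	return test_bit(block + blks_per_folio, ifs->state);
}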

Performance testing of the below fio workload reveals a ~16x performance
improvement using nvme with XFS (4k blocksize) on Power (64K pagesize).
FIO-reported write bandwidth improved from ~28 MBps to ~452 MBps.

1. <test_randwrite.fio>
[global]
	ioengine=psync
	rw=randwrite
	overwrite=1
	pre_read=1
	direct=0
	bs=4k
	size=1G
	dir=./
	numjobs=8
	fdatasync=1
	runtime=60
	iodepth=64
	group_reporting=1

[fio-run]

2. Also, our internal performance team reported that this patch improves
   their database workload performance by around 83% (with XFS on Power).

Reported-by: Aravinda Herle <araherle@in.ibm.com>
Reported-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 fs/gfs2/aops.c         |   2 +-
 fs/iomap/buffered-io.c | 149 ++++++++++++++++++++++++++++++++++++++---
 fs/xfs/xfs_aops.c      |   2 +-
 fs/zonefs/file.c       |   2 +-
 include/linux/iomap.h  |   1 +
 5 files changed, 142 insertions(+), 14 deletions(-)

Comments

Ritesh Harjani (IBM) July 6, 2023, 2:46 p.m. UTC | #1
"Ritesh Harjani (IBM)" <ritesh.list@gmail.com> writes:

> @@ -1637,7 +1758,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  		struct writeback_control *wbc, struct inode *inode,
>  		struct folio *folio, u64 end_pos)
>  {
> -	struct iomap_folio_state *ifs = ifs_alloc(inode, folio, 0);
> +	struct iomap_folio_state *ifs = folio->private;
>  	struct iomap_ioend *ioend, *next;
>  	unsigned len = i_blocksize(inode);
>  	unsigned nblocks = i_blocks_per_folio(inode, folio);
> @@ -1645,6 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	int error = 0, count = 0, i;
>  	LIST_HEAD(submit_list);
>  
> +	if (!ifs && nblocks > 1) {
> +		ifs = ifs_alloc(inode, folio, 0);
> +		iomap_set_range_dirty(folio, 0, folio_size(folio));
> +	}
> +
>  	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
>  
>  	/*
> @@ -1653,7 +1779,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	 * invalid, grab a new one.
>  	 */
>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> -		if (ifs && !ifs_block_is_uptodate(ifs, i))
> +		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
>  			continue;
>  
>  		error = wpc->ops->map_blocks(wpc, inode, pos);
> @@ -1697,6 +1823,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  		}
>  	}
>  
> +	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
>  	folio_start_writeback(folio);
>  	folio_unlock(folio);
>  

I think we should fold the below change into this patch.
end_pos is calculated in iomap_do_writepage() such that it is either
folio_pos(folio) + folio_size(folio), or, if that value exceeds isize,
end_pos is clamped to isize.
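
Roughly, that is (a simplified sketch of that calculation, not the exact
code in iomap_do_writepage()):

	u64 end_pos = folio_pos(folio) + folio_size(folio);
	loff_t isize = i_size_read(inode);

	/* folio straddles EOF: clamp the writeback end to i_size */
	if (end_pos > isize)
		end_pos = isize;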

The current patch does not have a functional problem, I guess. But in
some cases where truncate races with writeback, it will end up marking
more dirty bits than it later clears. Hence I think we should correct
it using the below diff.

I have added a WARN_ON_ONCE, but if you think it is obvious and not
required, feel free to drop it.

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 2fd9413838de..6c03e5842d44 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1766,9 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
        int error = 0, count = 0, i;
        LIST_HEAD(submit_list);

+       WARN_ON_ONCE(end_pos <= pos);
+
        if (!ifs && nblocks > 1) {
                ifs = ifs_alloc(inode, folio, 0);
-               iomap_set_range_dirty(folio, 0, folio_size(folio));
+               iomap_set_range_dirty(folio, 0, end_pos - pos);
        }

        WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);

-ritesh
Matthew Wilcox (Oracle) July 6, 2023, 5:42 p.m. UTC | #2
On Thu, Jul 06, 2023 at 08:16:05PM +0530, Ritesh Harjani wrote:
> > @@ -1645,6 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >  	int error = 0, count = 0, i;
> >  	LIST_HEAD(submit_list);
> >  
> > +	if (!ifs && nblocks > 1) {
> > +		ifs = ifs_alloc(inode, folio, 0);
> > +		iomap_set_range_dirty(folio, 0, folio_size(folio));
> > +	}
> > +
> >  	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
> >  
> >  	/*
> > @@ -1653,7 +1779,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >  	 * invalid, grab a new one.
> >  	 */
> >  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> > -		if (ifs && !ifs_block_is_uptodate(ifs, i))
> > +		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
> >  			continue;
> >  
> >  		error = wpc->ops->map_blocks(wpc, inode, pos);
> > @@ -1697,6 +1823,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >  		}
> >  	}
> >  
> > +	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
> >  	folio_start_writeback(folio);
> >  	folio_unlock(folio);
> >  
> 
> I think we should fold below change with this patch. 
> end_pos is calculated in iomap_do_writepage() such that it is either
> folio_pos(folio) + folio_size(folio), or if this value becomes more then
> isize, than end_pos is made isize.
> 
> The current patch does not have a functional problem I guess. But in
> some cases where truncate races with writeback, it will end up marking
> more bits & later doesn't clear those. Hence I think we should correct
> it using below diff.

I don't think this is the only place where we'll set dirty bits beyond
EOF.  For example, if we mmap the last partial folio in a file,
page_mkwrite will dirty the entire folio, but we won't write back
blocks past EOF.  I think we'd be better off clearing all the dirty
bits in the folio, even the ones past EOF.  What do you think?
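
(For context, with this series the page_mkwrite path dirties the whole folio
through ->dirty_folio; a condensed view of iomap_dirty_folio() from this
patch:)

/* folio_mark_dirty() ends up calling ->dirty_folio(), i.e. iomap_dirty_folio() */
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
	ifs_alloc(mapping->host, folio, 0);
	/* every block is marked dirty, including any blocks beyond EOF */
	iomap_set_range_dirty(folio, 0, folio_size(folio));
	return filemap_dirty_folio(mapping, folio);
}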
Dave Chinner July 6, 2023, 10:16 p.m. UTC | #3
On Thu, Jul 06, 2023 at 06:42:36PM +0100, Matthew Wilcox wrote:
> On Thu, Jul 06, 2023 at 08:16:05PM +0530, Ritesh Harjani wrote:
> > > @@ -1645,6 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > >  	int error = 0, count = 0, i;
> > >  	LIST_HEAD(submit_list);
> > >  
> > > +	if (!ifs && nblocks > 1) {
> > > +		ifs = ifs_alloc(inode, folio, 0);
> > > +		iomap_set_range_dirty(folio, 0, folio_size(folio));
> > > +	}
> > > +
> > >  	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
> > >  
> > >  	/*
> > > @@ -1653,7 +1779,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > >  	 * invalid, grab a new one.
> > >  	 */
> > >  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> > > -		if (ifs && !ifs_block_is_uptodate(ifs, i))
> > > +		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
> > >  			continue;
> > >  
> > >  		error = wpc->ops->map_blocks(wpc, inode, pos);
> > > @@ -1697,6 +1823,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > >  		}
> > >  	}
> > >  
> > > +	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
> > >  	folio_start_writeback(folio);
> > >  	folio_unlock(folio);
> > >  
> > 
> > I think we should fold below change with this patch. 
> > end_pos is calculated in iomap_do_writepage() such that it is either
> > folio_pos(folio) + folio_size(folio), or if this value becomes more then
> > isize, than end_pos is made isize.
> > 
> > The current patch does not have a functional problem I guess. But in
> > some cases where truncate races with writeback, it will end up marking
> > more bits & later doesn't clear those. Hence I think we should correct
> > it using below diff.
> 
> I don't think this is the only place where we'll set dirty bits beyond
> EOF.  For example, if we mmap the last partial folio in a file,
> page_mkwrite will dirty the entire folio, but we won't write back
> blocks past EOF.  I think we'd be better off clearing all the dirty
> bits in the folio, even the ones past EOF.  What do you think?

Clear the dirty bits beyond EOF where we zero the data range beyond
EOF in iomap_do_writepage() via folio_zero_segment()?

-Dave.
Matthew Wilcox (Oracle) July 6, 2023, 11:54 p.m. UTC | #4
On Fri, Jul 07, 2023 at 08:16:17AM +1000, Dave Chinner wrote:
> On Thu, Jul 06, 2023 at 06:42:36PM +0100, Matthew Wilcox wrote:
> > On Thu, Jul 06, 2023 at 08:16:05PM +0530, Ritesh Harjani wrote:
> > > > @@ -1645,6 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > > >  	int error = 0, count = 0, i;
> > > >  	LIST_HEAD(submit_list);
> > > >  
> > > > +	if (!ifs && nblocks > 1) {
> > > > +		ifs = ifs_alloc(inode, folio, 0);
> > > > +		iomap_set_range_dirty(folio, 0, folio_size(folio));
> > > > +	}
> > > > +
> > > >  	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
> > > >  
> > > >  	/*
> > > > @@ -1653,7 +1779,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > > >  	 * invalid, grab a new one.
> > > >  	 */
> > > >  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> > > > -		if (ifs && !ifs_block_is_uptodate(ifs, i))
> > > > +		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
> > > >  			continue;
> > > >  
> > > >  		error = wpc->ops->map_blocks(wpc, inode, pos);
> > > > @@ -1697,6 +1823,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > > >  		}
> > > >  	}
> > > >  
> > > > +	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
> > > >  	folio_start_writeback(folio);
> > > >  	folio_unlock(folio);
> > > >  
> > > 
> > > I think we should fold below change with this patch. 
> > > end_pos is calculated in iomap_do_writepage() such that it is either
> > > folio_pos(folio) + folio_size(folio), or if this value becomes more then
> > > isize, than end_pos is made isize.
> > > 
> > > The current patch does not have a functional problem I guess. But in
> > > some cases where truncate races with writeback, it will end up marking
> > > more bits & later doesn't clear those. Hence I think we should correct
> > > it using below diff.
> > 
> > I don't think this is the only place where we'll set dirty bits beyond
> > EOF.  For example, if we mmap the last partial folio in a file,
> > page_mkwrite will dirty the entire folio, but we won't write back
> > blocks past EOF.  I think we'd be better off clearing all the dirty
> > bits in the folio, even the ones past EOF.  What do you think?
> 
> Clear the dirty bits beyond EOF where we zero the data range beyond
> EOF in iomap_do_writepage() via folio_zero_segment()?

That would work, but I think it's simpler to change:

-	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
+	iomap_clear_range_dirty(folio, 0, folio_size(folio));
Ritesh Harjani (IBM) July 10, 2023, 6:19 p.m. UTC | #5
Matthew Wilcox <willy@infradead.org> writes:

Sorry for the delayed response. I am currently travelling.

> On Fri, Jul 07, 2023 at 08:16:17AM +1000, Dave Chinner wrote:
>> On Thu, Jul 06, 2023 at 06:42:36PM +0100, Matthew Wilcox wrote:
>> > On Thu, Jul 06, 2023 at 08:16:05PM +0530, Ritesh Harjani wrote:
>> > > > @@ -1645,6 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>> > > >  	int error = 0, count = 0, i;
>> > > >  	LIST_HEAD(submit_list);
>> > > >  
>> > > > +	if (!ifs && nblocks > 1) {
>> > > > +		ifs = ifs_alloc(inode, folio, 0);
>> > > > +		iomap_set_range_dirty(folio, 0, folio_size(folio));
>> > > > +	}
>> > > > +
>> > > >  	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
>> > > >  
>> > > >  	/*
>> > > > @@ -1653,7 +1779,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>> > > >  	 * invalid, grab a new one.
>> > > >  	 */
>> > > >  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
>> > > > -		if (ifs && !ifs_block_is_uptodate(ifs, i))
>> > > > +		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
>> > > >  			continue;
>> > > >  
>> > > >  		error = wpc->ops->map_blocks(wpc, inode, pos);
>> > > > @@ -1697,6 +1823,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>> > > >  		}
>> > > >  	}
>> > > >  
>> > > > +	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
>> > > >  	folio_start_writeback(folio);
>> > > >  	folio_unlock(folio);
>> > > >  
>> > > 
>> > > I think we should fold below change with this patch. 
>> > > end_pos is calculated in iomap_do_writepage() such that it is either
>> > > folio_pos(folio) + folio_size(folio), or if this value becomes more then
>> > > isize, than end_pos is made isize.
>> > > 
>> > > The current patch does not have a functional problem I guess. But in
>> > > some cases where truncate races with writeback, it will end up marking
>> > > more bits & later doesn't clear those. Hence I think we should correct
>> > > it using below diff.
>> > 
>> > I don't think this is the only place where we'll set dirty bits beyond
>> > EOF.  For example, if we mmap the last partial folio in a file,
>> > page_mkwrite will dirty the entire folio, but we won't write back
>> > blocks past EOF.  I think we'd be better off clearing all the dirty
>> > bits in the folio, even the ones past EOF.  What do you think?

Yup, I agree, it's better to clear all the dirty bits in the folio.
Thanks for the suggestion & nice catch!

>> 
>> Clear the dirty bits beyond EOF where we zero the data range beyond
>> EOF in iomap_do_writepage() via folio_zero_segment()?
>
> That would work, but I think it's simpler to change:
>
> -	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
> +	iomap_clear_range_dirty(folio, 0, folio_size(folio));

Right. 

@Darrick,
IMO, we should fold the below change into Patch-8. If you like, I can send a v12
with this change. I re-tested 1k-blocksize fstests on x86 with the
below changes included and didn't find any surprises. Also, the v11 series
including the below folded change applies cleanly on your
iomap-for-next branch.


diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index b6280e053d68..de212b6fe467 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1766,9 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
        int error = 0, count = 0, i;
        LIST_HEAD(submit_list);

+       WARN_ON_ONCE(end_pos <= pos);
+
        if (!ifs && nblocks > 1) {
                ifs = ifs_alloc(inode, folio, 0);
-               iomap_set_range_dirty(folio, 0, folio_size(folio));
+               iomap_set_range_dirty(folio, 0, end_pos - pos);
        }

        WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
@@ -1823,7 +1825,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
                }
        }

-       iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
+       /*
+        * We can have dirty bits set past end of file in page_mkwrite path
+        * while mapping the last partial folio. Hence it's better to clear
+        * all the dirty bits in the folio here.
+        */
+       iomap_clear_range_dirty(folio, 0, folio_size(folio));
        folio_start_writeback(folio);
        folio_unlock(folio);

--
2.30.2


-ritesh
Darrick J. Wong July 13, 2023, 4:36 a.m. UTC | #6
On Sat, Jul 01, 2023 at 01:04:41PM +0530, Ritesh Harjani (IBM) wrote:
> When filesystem blocksize is less than folio size (either with
> mapping_large_folio_support() or with blocksize < pagesize) and when the
> folio is uptodate in pagecache, then even a byte write can cause
> an entire folio to be written to disk during writeback. This happens
> because we currently don't have a mechanism to track per-block dirty
> state within struct iomap_folio_state. We currently only track uptodate
> state.
> 
> This patch implements support for tracking per-block dirty state in
> iomap_folio_state->state bitmap. This should help improve the filesystem
> write performance and help reduce write amplification.
> 
> Performance testing of below fio workload reveals ~16x performance
> improvement using nvme with XFS (4k blocksize) on Power (64K pagesize)
> FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> 
> 1. <test_randwrite.fio>
> [global]
> 	ioengine=psync
> 	rw=randwrite
> 	overwrite=1
> 	pre_read=1
> 	direct=0
> 	bs=4k
> 	size=1G
> 	dir=./
> 	numjobs=8
> 	fdatasync=1
> 	runtime=60
> 	iodepth=64
> 	group_reporting=1
> 
> [fio-run]
> 
> 2. Also our internal performance team reported that this patch improves
>    their database workload performance by around ~83% (with XFS on Power)
> 
> Reported-by: Aravinda Herle <araherle@in.ibm.com>
> Reported-by: Brian Foster <bfoster@redhat.com>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> ---
>  fs/gfs2/aops.c         |   2 +-
>  fs/iomap/buffered-io.c | 149 ++++++++++++++++++++++++++++++++++++++---
>  fs/xfs/xfs_aops.c      |   2 +-
>  fs/zonefs/file.c       |   2 +-
>  include/linux/iomap.h  |   1 +
>  5 files changed, 142 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index a5f4be6b9213..75efec3c3b71 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -746,7 +746,7 @@ static const struct address_space_operations gfs2_aops = {
>  	.writepages = gfs2_writepages,
>  	.read_folio = gfs2_read_folio,
>  	.readahead = gfs2_readahead,
> -	.dirty_folio = filemap_dirty_folio,
> +	.dirty_folio = iomap_dirty_folio,
>  	.release_folio = iomap_release_folio,
>  	.invalidate_folio = iomap_invalidate_folio,
>  	.bmap = gfs2_bmap,
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index fb6c2b6a4358..2fd9413838de 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -25,7 +25,7 @@
>  
>  typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
>  /*
> - * Structure allocated for each folio to track per-block uptodate state
> + * Structure allocated for each folio to track per-block uptodate, dirty state
>   * and I/O completions.
>   */
>  struct iomap_folio_state {
> @@ -78,6 +78,61 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
>  		folio_mark_uptodate(folio);
>  }
>  
> +static inline bool ifs_block_is_dirty(struct folio *folio,
> +		struct iomap_folio_state *ifs, int block)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +
> +	return test_bit(block + blks_per_folio, ifs->state);
> +}
> +
> +static void ifs_clear_range_dirty(struct folio *folio,
> +		struct iomap_folio_state *ifs, size_t off, size_t len)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +	unsigned int first_blk = (off >> inode->i_blkbits);
> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> +	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&ifs->state_lock, flags);
> +	bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);

Much clearer now that these have been simplified.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D


> +	spin_unlock_irqrestore(&ifs->state_lock, flags);
> +}
> +
> +static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
> +{
> +	struct iomap_folio_state *ifs = folio->private;
> +
> +	if (ifs)
> +		ifs_clear_range_dirty(folio, ifs, off, len);
> +}
> +
> +static void ifs_set_range_dirty(struct folio *folio,
> +		struct iomap_folio_state *ifs, size_t off, size_t len)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +	unsigned int first_blk = (off >> inode->i_blkbits);
> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> +	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&ifs->state_lock, flags);
> +	bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
> +	spin_unlock_irqrestore(&ifs->state_lock, flags);
> +}
> +
> +static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
> +{
> +	struct iomap_folio_state *ifs = folio->private;
> +
> +	if (ifs)
> +		ifs_set_range_dirty(folio, ifs, off, len);
> +}
> +
>  static struct iomap_folio_state *ifs_alloc(struct inode *inode,
>  		struct folio *folio, unsigned int flags)
>  {
> @@ -93,14 +148,24 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode,
>  	else
>  		gfp = GFP_NOFS | __GFP_NOFAIL;
>  
> -	ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(nr_blocks)),
> -		      gfp);
> -	if (ifs) {
> -		spin_lock_init(&ifs->state_lock);
> -		if (folio_test_uptodate(folio))
> -			bitmap_fill(ifs->state, nr_blocks);
> -		folio_attach_private(folio, ifs);
> -	}
> +	/*
> +	 * ifs->state tracks two sets of state flags when the
> +	 * filesystem block size is smaller than the folio size.
> +	 * The first state tracks per-block uptodate and the
> +	 * second tracks per-block dirty state.
> +	 */
> +	ifs = kzalloc(struct_size(ifs, state,
> +		      BITS_TO_LONGS(2 * nr_blocks)), gfp);
> +	if (!ifs)
> +		return ifs;
> +
> +	spin_lock_init(&ifs->state_lock);
> +	if (folio_test_uptodate(folio))
> +		bitmap_set(ifs->state, 0, nr_blocks);
> +	if (folio_test_dirty(folio))
> +		bitmap_set(ifs->state, nr_blocks, nr_blocks);
> +	folio_attach_private(folio, ifs);
> +
>  	return ifs;
>  }
>  
> @@ -523,6 +588,17 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
>  }
>  EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
>  
> +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
> +{
> +	struct inode *inode = mapping->host;
> +	size_t len = folio_size(folio);
> +
> +	ifs_alloc(inode, folio, 0);
> +	iomap_set_range_dirty(folio, 0, len);
> +	return filemap_dirty_folio(mapping, folio);
> +}
> +EXPORT_SYMBOL_GPL(iomap_dirty_folio);
> +
>  static void
>  iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
>  {
> @@ -727,6 +803,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>  	if (unlikely(copied < len && !folio_test_uptodate(folio)))
>  		return 0;
>  	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
> +	iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
>  	filemap_dirty_folio(inode->i_mapping, folio);
>  	return copied;
>  }
> @@ -891,6 +968,43 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
>  }
>  EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
>  
> +static int iomap_write_delalloc_ifs_punch(struct inode *inode,
> +		struct folio *folio, loff_t start_byte, loff_t end_byte,
> +		iomap_punch_t punch)
> +{
> +	unsigned int first_blk, last_blk, i;
> +	loff_t last_byte;
> +	u8 blkbits = inode->i_blkbits;
> +	struct iomap_folio_state *ifs;
> +	int ret = 0;
> +
> +	/*
> +	 * When we have per-block dirty tracking, there can be
> +	 * blocks within a folio which are marked uptodate
> +	 * but not dirty. In that case it is necessary to punch
> +	 * out such blocks to avoid leaking any delalloc blocks.
> +	 */
> +	ifs = folio->private;
> +	if (!ifs)
> +		return ret;
> +
> +	last_byte = min_t(loff_t, end_byte - 1,
> +			folio_pos(folio) + folio_size(folio) - 1);
> +	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
> +	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
> +	for (i = first_blk; i <= last_blk; i++) {
> +		if (!ifs_block_is_dirty(folio, ifs, i)) {
> +			ret = punch(inode, folio_pos(folio) + (i << blkbits),
> +				    1 << blkbits);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +
>  static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
>  		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
>  		iomap_punch_t punch)
> @@ -907,6 +1021,13 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
>  		if (ret)
>  			return ret;
>  	}
> +
> +	/* Punch non-dirty blocks within folio */
> +	ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
> +			end_byte, punch);
> +	if (ret)
> +		return ret;
> +
>  	/*
>  	 * Make sure the next punch start is correctly bound to
>  	 * the end of this data range, not the end of the folio.
> @@ -1637,7 +1758,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  		struct writeback_control *wbc, struct inode *inode,
>  		struct folio *folio, u64 end_pos)
>  {
> -	struct iomap_folio_state *ifs = ifs_alloc(inode, folio, 0);
> +	struct iomap_folio_state *ifs = folio->private;
>  	struct iomap_ioend *ioend, *next;
>  	unsigned len = i_blocksize(inode);
>  	unsigned nblocks = i_blocks_per_folio(inode, folio);
> @@ -1645,6 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	int error = 0, count = 0, i;
>  	LIST_HEAD(submit_list);
>  
> +	if (!ifs && nblocks > 1) {
> +		ifs = ifs_alloc(inode, folio, 0);
> +		iomap_set_range_dirty(folio, 0, folio_size(folio));
> +	}
> +
>  	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
>  
>  	/*
> @@ -1653,7 +1779,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	 * invalid, grab a new one.
>  	 */
>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> -		if (ifs && !ifs_block_is_uptodate(ifs, i))
> +		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
>  			continue;
>  
>  		error = wpc->ops->map_blocks(wpc, inode, pos);
> @@ -1697,6 +1823,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  		}
>  	}
>  
> +	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
>  	folio_start_writeback(folio);
>  	folio_unlock(folio);
>  
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 451942fb38ec..2fca4b4e7fd8 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
>  	.read_folio		= xfs_vm_read_folio,
>  	.readahead		= xfs_vm_readahead,
>  	.writepages		= xfs_vm_writepages,
> -	.dirty_folio		= filemap_dirty_folio,
> +	.dirty_folio		= iomap_dirty_folio,
>  	.release_folio		= iomap_release_folio,
>  	.invalidate_folio	= iomap_invalidate_folio,
>  	.bmap			= xfs_vm_bmap,
> diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
> index 132f01d3461f..e508c8e97372 100644
> --- a/fs/zonefs/file.c
> +++ b/fs/zonefs/file.c
> @@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
>  	.read_folio		= zonefs_read_folio,
>  	.readahead		= zonefs_readahead,
>  	.writepages		= zonefs_writepages,
> -	.dirty_folio		= filemap_dirty_folio,
> +	.dirty_folio		= iomap_dirty_folio,
>  	.release_folio		= iomap_release_folio,
>  	.invalidate_folio	= iomap_invalidate_folio,
>  	.migrate_folio		= filemap_migrate_folio,
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index e2b836c2e119..eb9335c46bf3 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -264,6 +264,7 @@ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
>  struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
>  bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
>  void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
> +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
>  int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
>  		const struct iomap_ops *ops);
>  int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
> -- 
> 2.40.1
>
Darrick J. Wong July 13, 2023, 4:38 a.m. UTC | #7
On Mon, Jul 10, 2023 at 11:49:15PM +0530, Ritesh Harjani wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> 
> Sorry for the delayed response. I am currently on travel.
> 
> > On Fri, Jul 07, 2023 at 08:16:17AM +1000, Dave Chinner wrote:
> >> On Thu, Jul 06, 2023 at 06:42:36PM +0100, Matthew Wilcox wrote:
> >> > On Thu, Jul 06, 2023 at 08:16:05PM +0530, Ritesh Harjani wrote:
> >> > > > @@ -1645,6 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >> > > >  	int error = 0, count = 0, i;
> >> > > >  	LIST_HEAD(submit_list);
> >> > > >  
> >> > > > +	if (!ifs && nblocks > 1) {
> >> > > > +		ifs = ifs_alloc(inode, folio, 0);
> >> > > > +		iomap_set_range_dirty(folio, 0, folio_size(folio));
> >> > > > +	}
> >> > > > +
> >> > > >  	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
> >> > > >  
> >> > > >  	/*
> >> > > > @@ -1653,7 +1779,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >> > > >  	 * invalid, grab a new one.
> >> > > >  	 */
> >> > > >  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> >> > > > -		if (ifs && !ifs_block_is_uptodate(ifs, i))
> >> > > > +		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
> >> > > >  			continue;
> >> > > >  
> >> > > >  		error = wpc->ops->map_blocks(wpc, inode, pos);
> >> > > > @@ -1697,6 +1823,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >> > > >  		}
> >> > > >  	}
> >> > > >  
> >> > > > +	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
> >> > > >  	folio_start_writeback(folio);
> >> > > >  	folio_unlock(folio);
> >> > > >  
> >> > > 
> >> > > I think we should fold below change with this patch. 
> >> > > end_pos is calculated in iomap_do_writepage() such that it is either
> >> > > folio_pos(folio) + folio_size(folio), or if this value becomes more then
> >> > > isize, than end_pos is made isize.
> >> > > 
> >> > > The current patch does not have a functional problem I guess. But in
> >> > > some cases where truncate races with writeback, it will end up marking
> >> > > more bits & later doesn't clear those. Hence I think we should correct
> >> > > it using below diff.
> >> > 
> >> > I don't think this is the only place where we'll set dirty bits beyond
> >> > EOF.  For example, if we mmap the last partial folio in a file,
> >> > page_mkwrite will dirty the entire folio, but we won't write back
> >> > blocks past EOF.  I think we'd be better off clearing all the dirty
> >> > bits in the folio, even the ones past EOF.  What do you think?
> 
> Yup. I agree, it's better that way to clear all dirty bits in the folio.
> Thanks for the suggestion & nice catch!! 
> 
> >> 
> >> Clear the dirty bits beyond EOF where we zero the data range beyond
> >> EOF in iomap_do_writepage() via folio_zero_segment()?
> >
> > That would work, but I think it's simpler to change:
> >
> > -	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
> > +	iomap_clear_range_dirty(folio, 0, folio_size(folio));
> 
> Right. 
> 
> @Darrick,
> IMO, we should fold below change with Patch-8. If you like I can send a v12
> with this change. I re-tested 1k-blocksize fstests on x86 with
> below changes included and didn't find any surprise. Also v11 series
> including the below folded change is cleanly applicable on your
> iomap-for-next branch.

Yes, please fold this into v12.  I think Matthew might want to get these
iomap folio changes out to for-next even sooner than -rc4.  If there's
time during this week's ext4 call, let's talk about that.

--D

> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index b6280e053d68..de212b6fe467 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1766,9 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>         int error = 0, count = 0, i;
>         LIST_HEAD(submit_list);
> 
> +       WARN_ON_ONCE(end_pos <= pos);
> +
>         if (!ifs && nblocks > 1) {
>                 ifs = ifs_alloc(inode, folio, 0);
> -               iomap_set_range_dirty(folio, 0, folio_size(folio));
> +               iomap_set_range_dirty(folio, 0, end_pos - pos);
>         }
> 
>         WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
> @@ -1823,7 +1825,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>                 }
>         }
> 
> -       iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
> +       /*
> +        * We can have dirty bits set past end of file in page_mkwrite path
> +        * while mapping the last partial folio. Hence it's better to clear
> +        * all the dirty bits in the folio here.
> +        */
> +       iomap_clear_range_dirty(folio, 0, folio_size(folio));
>         folio_start_writeback(folio);
>         folio_unlock(folio);
> 
> --
> 2.30.2
> 
> 
> -ritesh
Ritesh Harjani (IBM) July 13, 2023, 5:27 a.m. UTC | #8
"Darrick J. Wong" <djwong@kernel.org> writes:

> On Mon, Jul 10, 2023 at 11:49:15PM +0530, Ritesh Harjani wrote:
>> Matthew Wilcox <willy@infradead.org> writes:
>> 
>> Sorry for the delayed response. I am currently on travel.
>> 
>> > On Fri, Jul 07, 2023 at 08:16:17AM +1000, Dave Chinner wrote:
>> >> On Thu, Jul 06, 2023 at 06:42:36PM +0100, Matthew Wilcox wrote:
>> >> > On Thu, Jul 06, 2023 at 08:16:05PM +0530, Ritesh Harjani wrote:
>> >> > > > @@ -1645,6 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>> >> > > >  	int error = 0, count = 0, i;
>> >> > > >  	LIST_HEAD(submit_list);
>> >> > > >  
>> >> > > > +	if (!ifs && nblocks > 1) {
>> >> > > > +		ifs = ifs_alloc(inode, folio, 0);
>> >> > > > +		iomap_set_range_dirty(folio, 0, folio_size(folio));
>> >> > > > +	}
>> >> > > > +
>> >> > > >  	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
>> >> > > >  
>> >> > > >  	/*
>> >> > > > @@ -1653,7 +1779,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>> >> > > >  	 * invalid, grab a new one.
>> >> > > >  	 */
>> >> > > >  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
>> >> > > > -		if (ifs && !ifs_block_is_uptodate(ifs, i))
>> >> > > > +		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
>> >> > > >  			continue;
>> >> > > >  
>> >> > > >  		error = wpc->ops->map_blocks(wpc, inode, pos);
>> >> > > > @@ -1697,6 +1823,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>> >> > > >  		}
>> >> > > >  	}
>> >> > > >  
>> >> > > > +	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
>> >> > > >  	folio_start_writeback(folio);
>> >> > > >  	folio_unlock(folio);
>> >> > > >  
>> >> > > 
>> >> > > I think we should fold below change with this patch. 
>> >> > > end_pos is calculated in iomap_do_writepage() such that it is either
>> >> > > folio_pos(folio) + folio_size(folio), or if this value becomes more then
>> >> > > isize, than end_pos is made isize.
>> >> > > 
>> >> > > The current patch does not have a functional problem I guess. But in
>> >> > > some cases where truncate races with writeback, it will end up marking
>> >> > > more bits & later doesn't clear those. Hence I think we should correct
>> >> > > it using below diff.
>> >> > 
>> >> > I don't think this is the only place where we'll set dirty bits beyond
>> >> > EOF.  For example, if we mmap the last partial folio in a file,
>> >> > page_mkwrite will dirty the entire folio, but we won't write back
>> >> > blocks past EOF.  I think we'd be better off clearing all the dirty
>> >> > bits in the folio, even the ones past EOF.  What do you think?
>> 
>> Yup. I agree, it's better that way to clear all dirty bits in the folio.
>> Thanks for the suggestion & nice catch!! 
>> 
>> >> 
>> >> Clear the dirty bits beyond EOF where we zero the data range beyond
>> >> EOF in iomap_do_writepage() via folio_zero_segment()?
>> >
>> > That would work, but I think it's simpler to change:
>> >
>> > -	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
>> > +	iomap_clear_range_dirty(folio, 0, folio_size(folio));
>> 
>> Right. 
>> 
>> @Darrick,
>> IMO, we should fold below change with Patch-8. If you like I can send a v12
>> with this change. I re-tested 1k-blocksize fstests on x86 with
>> below changes included and didn't find any surprise. Also v11 series
>> including the below folded change is cleanly applicable on your
>> iomap-for-next branch.
>
> Yes, please fold this into v12.  I think Matthew might want to get these

Sure, I can fold this into Patch-8 in v12 then. I also need to rebase it
on top of Matthew's changes then, right?

> iomap folio changes out to for-next even sooner than -rc4.  If there's
> time during this week's ext4 call, let's talk about that.

Sure. Post our call, I can prepare and send a v12.

-ritesh

Patch

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a5f4be6b9213..75efec3c3b71 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -746,7 +746,7 @@  static const struct address_space_operations gfs2_aops = {
 	.writepages = gfs2_writepages,
 	.read_folio = gfs2_read_folio,
 	.readahead = gfs2_readahead,
-	.dirty_folio = filemap_dirty_folio,
+	.dirty_folio = iomap_dirty_folio,
 	.release_folio = iomap_release_folio,
 	.invalidate_folio = iomap_invalidate_folio,
 	.bmap = gfs2_bmap,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fb6c2b6a4358..2fd9413838de 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -25,7 +25,7 @@ 
 
 typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
 /*
- * Structure allocated for each folio to track per-block uptodate state
+ * Structure allocated for each folio to track per-block uptodate, dirty state
  * and I/O completions.
  */
 struct iomap_folio_state {
@@ -78,6 +78,61 @@  static void iomap_set_range_uptodate(struct folio *folio, size_t off,
 		folio_mark_uptodate(folio);
 }
 
+static inline bool ifs_block_is_dirty(struct folio *folio,
+		struct iomap_folio_state *ifs, int block)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+	return test_bit(block + blks_per_folio, ifs->state);
+}
+
+static void ifs_clear_range_dirty(struct folio *folio,
+		struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = (off >> inode->i_blkbits);
+	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ifs->state_lock, flags);
+	bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (ifs)
+		ifs_clear_range_dirty(folio, ifs, off, len);
+}
+
+static void ifs_set_range_dirty(struct folio *folio,
+		struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = (off >> inode->i_blkbits);
+	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ifs->state_lock, flags);
+	bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (ifs)
+		ifs_set_range_dirty(folio, ifs, off, len);
+}
+
 static struct iomap_folio_state *ifs_alloc(struct inode *inode,
 		struct folio *folio, unsigned int flags)
 {
@@ -93,14 +148,24 @@  static struct iomap_folio_state *ifs_alloc(struct inode *inode,
 	else
 		gfp = GFP_NOFS | __GFP_NOFAIL;
 
-	ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(nr_blocks)),
-		      gfp);
-	if (ifs) {
-		spin_lock_init(&ifs->state_lock);
-		if (folio_test_uptodate(folio))
-			bitmap_fill(ifs->state, nr_blocks);
-		folio_attach_private(folio, ifs);
-	}
+	/*
+	 * ifs->state tracks two sets of state flags when the
+	 * filesystem block size is smaller than the folio size.
+	 * The first state tracks per-block uptodate and the
+	 * second tracks per-block dirty state.
+	 */
+	ifs = kzalloc(struct_size(ifs, state,
+		      BITS_TO_LONGS(2 * nr_blocks)), gfp);
+	if (!ifs)
+		return ifs;
+
+	spin_lock_init(&ifs->state_lock);
+	if (folio_test_uptodate(folio))
+		bitmap_set(ifs->state, 0, nr_blocks);
+	if (folio_test_dirty(folio))
+		bitmap_set(ifs->state, nr_blocks, nr_blocks);
+	folio_attach_private(folio, ifs);
+
 	return ifs;
 }
 
@@ -523,6 +588,17 @@  void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
 
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+	struct inode *inode = mapping->host;
+	size_t len = folio_size(folio);
+
+	ifs_alloc(inode, folio, 0);
+	iomap_set_range_dirty(folio, 0, len);
+	return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -727,6 +803,7 @@  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	if (unlikely(copied < len && !folio_test_uptodate(folio)))
 		return 0;
 	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
+	iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return copied;
 }
@@ -891,6 +968,43 @@  iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+		struct folio *folio, loff_t start_byte, loff_t end_byte,
+		iomap_punch_t punch)
+{
+	unsigned int first_blk, last_blk, i;
+	loff_t last_byte;
+	u8 blkbits = inode->i_blkbits;
+	struct iomap_folio_state *ifs;
+	int ret = 0;
+
+	/*
+	 * When we have per-block dirty tracking, there can be
+	 * blocks within a folio which are marked uptodate
+	 * but not dirty. In that case it is necessary to punch
+	 * out such blocks to avoid leaking any delalloc blocks.
+	 */
+	ifs = folio->private;
+	if (!ifs)
+		return ret;
+
+	last_byte = min_t(loff_t, end_byte - 1,
+			folio_pos(folio) + folio_size(folio) - 1);
+	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
+	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
+	for (i = first_blk; i <= last_blk; i++) {
+		if (!ifs_block_is_dirty(folio, ifs, i)) {
+			ret = punch(inode, folio_pos(folio) + (i << blkbits),
+				    1 << blkbits);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+
 static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
 		iomap_punch_t punch)
@@ -907,6 +1021,13 @@  static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 		if (ret)
 			return ret;
 	}
+
+	/* Punch non-dirty blocks within folio */
+	ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
+			end_byte, punch);
+	if (ret)
+		return ret;
+
 	/*
 	 * Make sure the next punch start is correctly bound to
 	 * the end of this data range, not the end of the folio.
@@ -1637,7 +1758,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct inode *inode,
 		struct folio *folio, u64 end_pos)
 {
-	struct iomap_folio_state *ifs = ifs_alloc(inode, folio, 0);
+	struct iomap_folio_state *ifs = folio->private;
 	struct iomap_ioend *ioend, *next;
 	unsigned len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1645,6 +1766,11 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	int error = 0, count = 0, i;
 	LIST_HEAD(submit_list);
 
+	if (!ifs && nblocks > 1) {
+		ifs = ifs_alloc(inode, folio, 0);
+		iomap_set_range_dirty(folio, 0, folio_size(folio));
+	}
+
 	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
 
 	/*
@@ -1653,7 +1779,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * invalid, grab a new one.
 	 */
 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (ifs && !ifs_block_is_uptodate(ifs, i))
+		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
 			continue;
 
 		error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1697,6 +1823,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		}
 	}
 
+	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
 	folio_start_writeback(folio);
 	folio_unlock(folio);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 451942fb38ec..2fca4b4e7fd8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -578,7 +578,7 @@  const struct address_space_operations xfs_address_space_operations = {
 	.read_folio		= xfs_vm_read_folio,
 	.readahead		= xfs_vm_readahead,
 	.writepages		= xfs_vm_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.bmap			= xfs_vm_bmap,
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 132f01d3461f..e508c8e97372 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -175,7 +175,7 @@  const struct address_space_operations zonefs_file_aops = {
 	.read_folio		= zonefs_read_folio,
 	.readahead		= zonefs_readahead,
 	.writepages		= zonefs_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.migrate_folio		= filemap_migrate_folio,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..eb9335c46bf3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -264,6 +264,7 @@  bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,