diff mbox series

[PATCHv10,8/8] iomap: Add per-block dirty state tracking to improve performance

Message ID 6db62a08dda3a348303e2262454837149c2afe2a.1687140389.git.ritesh.list@gmail.com (mailing list archive)
State New, archived
Headers show
Series iomap: Add support for per-block dirty state to improve write performance | expand

Commit Message

Ritesh Harjani (IBM) June 19, 2023, 2:28 a.m. UTC
When filesystem blocksize is less than folio size (either with
mapping_large_folio_support() or with blocksize < pagesize) and when the
folio is uptodate in pagecache, then even a byte write can cause
an entire folio to be written to disk during writeback. This happens
because we currently don't have a mechanism to track per-block dirty
state within struct iomap_folio_state. We currently only track uptodate
state.

This patch implements support for tracking per-block dirty state in
iomap_folio_state->state bitmap. This should help improve the filesystem
write performance and help reduce write amplification.

Performance testing of below fio workload reveals ~16x performance
improvement using nvme with XFS (4k blocksize) on Power (64K pagesize)
FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.

1. <test_randwrite.fio>
[global]
	ioengine=psync
	rw=randwrite
	overwrite=1
	pre_read=1
	direct=0
	bs=4k
	size=1G
	dir=./
	numjobs=8
	fdatasync=1
	runtime=60
	iodepth=64
	group_reporting=1

[fio-run]

2. Also our internal performance team reported that this patch improves
   their database workload performance by around ~83% (with XFS on Power)

Reported-by: Aravinda Herle <araherle@in.ibm.com>
Reported-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 fs/gfs2/aops.c         |   2 +-
 fs/iomap/buffered-io.c | 189 ++++++++++++++++++++++++++++++++++++-----
 fs/xfs/xfs_aops.c      |   2 +-
 fs/zonefs/file.c       |   2 +-
 include/linux/iomap.h  |   1 +
 5 files changed, 171 insertions(+), 25 deletions(-)

Comments

Andreas Gruenbacher June 19, 2023, 12:43 p.m. UTC | #1
On Mon, Jun 19, 2023 at 4:29 AM Ritesh Harjani (IBM)
<ritesh.list@gmail.com> wrote:
> When filesystem blocksize is less than folio size (either with
> mapping_large_folio_support() or with blocksize < pagesize) and when the
> folio is uptodate in pagecache, then even a byte write can cause
> an entire folio to be written to disk during writeback. This happens
> because we currently don't have a mechanism to track per-block dirty
> state within struct iomap_folio_state. We currently only track uptodate
> state.
>
> This patch implements support for tracking per-block dirty state in
> iomap_folio_state->state bitmap. This should help improve the filesystem
> write performance and help reduce write amplification.
>
> Performance testing of below fio workload reveals ~16x performance
> improvement using nvme with XFS (4k blocksize) on Power (64K pagesize)
> FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
>
> 1. <test_randwrite.fio>
> [global]
>         ioengine=psync
>         rw=randwrite
>         overwrite=1
>         pre_read=1
>         direct=0
>         bs=4k
>         size=1G
>         dir=./
>         numjobs=8
>         fdatasync=1
>         runtime=60
>         iodepth=64
>         group_reporting=1
>
> [fio-run]
>
> 2. Also our internal performance team reported that this patch improves
>    their database workload performance by around ~83% (with XFS on Power)
>
> Reported-by: Aravinda Herle <araherle@in.ibm.com>
> Reported-by: Brian Foster <bfoster@redhat.com>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> ---
>  fs/gfs2/aops.c         |   2 +-
>  fs/iomap/buffered-io.c | 189 ++++++++++++++++++++++++++++++++++++-----
>  fs/xfs/xfs_aops.c      |   2 +-
>  fs/zonefs/file.c       |   2 +-
>  include/linux/iomap.h  |   1 +
>  5 files changed, 171 insertions(+), 25 deletions(-)
>
> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index a5f4be6b9213..75efec3c3b71 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -746,7 +746,7 @@ static const struct address_space_operations gfs2_aops = {
>         .writepages = gfs2_writepages,
>         .read_folio = gfs2_read_folio,
>         .readahead = gfs2_readahead,
> -       .dirty_folio = filemap_dirty_folio,
> +       .dirty_folio = iomap_dirty_folio,
>         .release_folio = iomap_release_folio,
>         .invalidate_folio = iomap_invalidate_folio,
>         .bmap = gfs2_bmap,
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 391d918ddd22..50f5840bb5f9 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -25,7 +25,7 @@
>
>  typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
>  /*
> - * Structure allocated for each folio to track per-block uptodate state
> + * Structure allocated for each folio to track per-block uptodate, dirty state
>   * and I/O completions.
>   */
>  struct iomap_folio_state {
> @@ -35,31 +35,55 @@ struct iomap_folio_state {
>         unsigned long           state[];
>  };
>
> +enum iomap_block_state {
> +       IOMAP_ST_UPTODATE,
> +       IOMAP_ST_DIRTY,
> +
> +       IOMAP_ST_MAX,
> +};
> +
> +static void ifs_calc_range(struct folio *folio, size_t off, size_t len,
> +               enum iomap_block_state state, unsigned int *first_blkp,
> +               unsigned int *nr_blksp)
> +{
> +       struct inode *inode = folio->mapping->host;
> +       unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +       unsigned int first = off >> inode->i_blkbits;
> +       unsigned int last = (off + len - 1) >> inode->i_blkbits;
> +
> +       *first_blkp = first + (state * blks_per_folio);
> +       *nr_blksp = last - first + 1;
> +}
> +
>  static struct bio_set iomap_ioend_bioset;
>
>  static inline bool ifs_is_fully_uptodate(struct folio *folio,
>                                                struct iomap_folio_state *ifs)
>  {
>         struct inode *inode = folio->mapping->host;
> +       unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +       unsigned int nr_blks = (IOMAP_ST_UPTODATE + 1) * blks_per_folio;

This nr_blks calculation doesn't make sense.

> -       return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
> +       return bitmap_full(ifs->state, nr_blks);

Could you please change this to:

BUILD_BUG_ON(IOMAP_ST_UPTODATE != 0);
return bitmap_full(ifs->state, blks_per_folio);

Also, I'm seeing that the value of i_blocks_per_folio() is assigned to
local variables with various names in several places (blks_per_folio,
nr_blocks, nblocks). Maybe this could be made consistent.

>  }
>
> -static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
> -                                    blks_per_folio          unsigned int block)blks_per_folio
> +static inline bool ifs_block_is_uptodate(struct folio *folio,
> +               struct iomap_folio_state *ifs, unsigned int block)
>  {
> -       return test_bit(block, ifs->state);
> +       struct inode *inode = folio->mapping->host;
> +       unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +
> +       return test_bit(block + IOMAP_ST_UPTODATE * blks_per_folio, ifs->state);
>  }
>
>  static void ifs_set_range_uptodate(struct folio *folio,
>                 struct iomap_folio_state *ifs, size_t off, size_t len)
>  {
> -       struct inode *inode = folio->mapping->host;
> -       unsigned int first_blk = off >> inode->i_blkbits;
> -       unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> -       unsigned int nr_blks = last_blk - first_blk + 1;
> +       unsigned int first_blk, nr_blks;
>         unsigned long flags;
>
> +       ifs_calc_range(folio, off, len, IOMAP_ST_UPTODATE, &first_blk,
> +                            &nr_blks);
>         spin_lock_irqsave(&ifs->state_lock, flags);
>         bitmap_set(ifs->state, first_blk, nr_blks);
>         if (ifs_is_fully_uptodate(folio, ifs))
> @@ -78,6 +102,55 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
>                 folio_mark_uptodate(folio);
>  }
>
> +static inline bool ifs_block_is_dirty(struct folio *folio,
> +               struct iomap_folio_state *ifs, int block)
> +{
> +       struct inode *inode = folio->mapping->host;
> +       unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +
> +       return test_bit(block + IOMAP_ST_DIRTY * blks_per_folio, ifs->state);
> +}
> +
> +static void ifs_clear_range_dirty(struct folio *folio,
> +               struct iomap_folio_state *ifs, size_t off, size_t len)
> +{
> +       unsigned int first_blk, nr_blks;
> +       unsigned long flags;
> +
> +       ifs_calc_range(folio, off, len, IOMAP_ST_DIRTY, &first_blk, &nr_blks);
> +       spin_lock_irqsave(&ifs->state_lock, flags);
> +       bitmap_clear(ifs->state, first_blk, nr_blks);
> +       spin_unlock_irqrestore(&ifs->state_lock, flags);
> +}
> +
> +static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
> +{
> +       struct iomap_folio_state *ifs = folio->private;
> +
> +       if (ifs)
> +               ifs_clear_range_dirty(folio, ifs, off, len);
> +}
> +
> +static void ifs_set_range_dirty(struct folio *folio,
> +               struct iomap_folio_state *ifs, size_t off, size_t len)
> +{
> +       unsigned int first_blk, nr_blks;
> +       unsigned long flags;
> +
> +       ifs_calc_range(folio, off, len, IOMAP_ST_DIRTY, &first_blk, &nr_blks);
> +       spin_lock_irqsave(&ifs->state_lock, flags);
> +       bitmap_set(ifs->state, first_blk, nr_blks);
> +       spin_unlock_irqrestore(&ifs->state_lock, flags);
> +}
> +
> +static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
> +{
> +       struct iomap_folio_state *ifs = folio->private;
> +
> +       if (ifs)
> +               ifs_set_range_dirty(folio, ifs, off, len);
> +}
> +
>  static struct iomap_folio_state *ifs_alloc(struct inode *inode,
>                 struct folio *folio, unsigned int flags)
>  {
> @@ -93,14 +166,24 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode,
>         else
>                 gfp = GFP_NOFS | __GFP_NOFAIL;
>
> -       ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(nr_blocks)),
> -                     gfp);
> -       if (ifs) {
> -               spin_lock_init(&ifs->state_lock);
> -               if (folio_test_uptodate(folio))
> -                       bitmap_fill(ifs->state, nr_blocks);
> -               folio_attach_private(folio, ifs);
> -       }
> +       /*
> +        * ifs->state tracks two sets of state flags when the
> +        * filesystem block size is smaller than the folio size.
> +        * The first state tracks per-block uptodate and the
> +        * second tracks per-block dirty state.
> +        */
> +       ifs = kzalloc(struct_size(ifs, state,
> +                     BITS_TO_LONGS(IOMAP_ST_MAX * nr_blocks)), gfp);
> +       if (!ifs)
> +               return ifs;
> +
> +       spin_lock_init(&ifs->state_lock);
> +       if (folio_test_uptodate(folio))
> +               bitmap_set(ifs->state, 0, nr_blocks);
> +       if (folio_test_dirty(folio))
> +               bitmap_set(ifs->state, nr_blocks, nr_blocks);
> +       folio_attach_private(folio, ifs);
> +
>         return ifs;
>  }
>
> @@ -143,7 +226,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>
>                 /* move forward for each leading block marked uptodate */
>                 for (i = first; i <= last; i++) {
> -                       if (!ifs_block_is_uptodate(ifs, i))
> +                       if (!ifs_block_is_uptodate(folio, ifs, i))
>                                 break;
>                         *pos += block_size;
>                         poff += block_size;
> @@ -153,7 +236,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>
>                 /* truncate len if we find any trailing uptodate block(s) */
>                 for ( ; i <= last; i++) {
> -                       if (ifs_block_is_uptodate(ifs, i)) {
> +                       if (ifs_block_is_uptodate(folio, ifs, i)) {
>                                 plen -= (last - i + 1) * block_size;
>                                 last = i - 1;
>                                 break;
> @@ -457,7 +540,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
>         last = (from + count - 1) >> inode->i_blkbits;
>
>         for (i = first; i <= last; i++)
> -               if (!ifs_block_is_uptodate(ifs, i))
> +               if (!ifs_block_is_uptodate(folio, ifs, i))
>                         return false;
>         return true;
>  }
> @@ -523,6 +606,17 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
>  }
>  EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
>
> +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
> +{
> +       struct inode *inode = mapping->host;
> +       size_t len = folio_size(folio);
> +
> +       ifs_alloc(inode, folio, 0);
> +       iomap_set_range_dirty(folio, 0, len);
> +       return filemap_dirty_folio(mapping, folio);
> +}
> +EXPORT_SYMBOL_GPL(iomap_dirty_folio);
> +
>  static void
>  iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
>  {
> @@ -727,6 +821,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>         if (unlikely(copied < len && !folio_test_uptodate(folio)))
>                 return 0;
>         iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
> +       iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
>         filemap_dirty_folio(inode->i_mapping, folio);
>         return copied;
>  }
> @@ -891,6 +986,43 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
>  }
>  EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
>
> +static int iomap_write_delalloc_ifs_punch(struct inode *inode,
> +               struct folio *folio, loff_t start_byte, loff_t end_byte,
> +               iomap_punch_t punch)
> +{
> +       unsigned int first_blk, last_blk, i;
> +       loff_t last_byte;
> +       u8 blkbits = inode->i_blkbits;
> +       struct iomap_folio_state *ifs;
> +       int ret = 0;
> +
> +       /*
> +        * When we have per-block dirty tracking, there can be
> +        * blocks within a folio which are marked uptodate
> +        * but not dirty. In that case it is necessary to punch
> +        * out such blocks to avoid leaking any delalloc blocks.
> +        */
> +       ifs = folio->private;
> +       if (!ifs)
> +               return ret;
> +
> +       last_byte = min_t(loff_t, end_byte - 1,
> +                       folio_pos(folio) + folio_size(folio) - 1);
> +       first_blk = offset_in_folio(folio, start_byte) >> blkbits;
> +       last_blk = offset_in_folio(folio, last_byte) >> blkbits;
> +       for (i = first_blk; i <= last_blk; i++) {
> +               if (!ifs_block_is_dirty(folio, ifs, i)) {
> +                       ret = punch(inode, folio_pos(folio) + (i << blkbits),
> +                                   1 << blkbits);
> +                       if (ret)
> +                               return ret;
> +               }
> +       }
> +
> +       return ret;
> +}
> +
> +
>  static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
>                 loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
>                 iomap_punch_t punch)
> @@ -907,6 +1039,13 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
>                 if (ret)
>                         return ret;
>         }
> +
> +       /* Punch non-dirty blocks within folio */
> +       ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
> +                       end_byte, punch);
> +       if (ret)
> +               return ret;
> +
>         /*
>          * Make sure the next punch start is correctly bound to
>          * the end of this data range, not the end of the folio.
> @@ -1637,7 +1776,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>                 struct writeback_control *wbc, struct inode *inode,
>                 struct folio *folio, u64 end_pos)
>  {
> -       struct iomap_folio_state *ifs = ifs_alloc(inode, folio, 0);
> +       struct iomap_folio_state *ifs = folio->private;
>         struct iomap_ioend *ioend, *next;
>         unsigned len = i_blocksize(inode);
>         unsigned nblocks = i_blocks_per_folio(inode, folio);
> @@ -1645,6 +1784,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>         int error = 0, count = 0, i;
>         LIST_HEAD(submit_list);
>
> +       if (!ifs && nblocks > 1) {
> +               ifs = ifs_alloc(inode, folio, 0);
> +               iomap_set_range_dirty(folio, 0, folio_size(folio));
> +       }
> +
>         WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
>
>         /*
> @@ -1653,7 +1797,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>          * invalid, grab a new one.
>          */
>         for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> -               if (ifs && !ifs_block_is_uptodate(ifs, i))
> +               if (ifs && !ifs_block_is_dirty(folio, ifs, i))
>                         continue;
>
>                 error = wpc->ops->map_blocks(wpc, inode, pos);
> @@ -1697,6 +1841,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>                 }
>         }
>
> +       iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
>         folio_start_writeback(folio);
>         folio_unlock(folio);
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 451942fb38ec..2fca4b4e7fd8 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
>         .read_folio             = xfs_vm_read_folio,
>         .readahead              = xfs_vm_readahead,
>         .writepages             = xfs_vm_writepages,
> -       .dirty_folio            = filemap_dirty_folio,
> +       .dirty_folio            = iomap_dirty_folio,
>         .release_folio          = iomap_release_folio,
>         .invalidate_folio       = iomap_invalidate_folio,
>         .bmap                   = xfs_vm_bmap,
> diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
> index 132f01d3461f..e508c8e97372 100644
> --- a/fs/zonefs/file.c
> +++ b/fs/zonefs/file.c
> @@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
>         .read_folio             = zonefs_read_folio,
>         .readahead              = zonefs_readahead,
>         .writepages             = zonefs_writepages,
> -       .dirty_folio            = filemap_dirty_folio,
> +       .dirty_folio            = iomap_dirty_folio,
>         .release_folio          = iomap_release_folio,
>         .invalidate_folio       = iomap_invalidate_folio,
>         .migrate_folio          = filemap_migrate_folio,
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index e2b836c2e119..eb9335c46bf3 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -264,6 +264,7 @@ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
>  struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
>  bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
>  void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
> +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
>  int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
>                 const struct iomap_ops *ops);
>  int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
> --
> 2.40.1
>

Thanks,
Andreas
Matthew Wilcox (Oracle) June 19, 2023, 2:26 p.m. UTC | #2
On Mon, Jun 19, 2023 at 07:58:51AM +0530, Ritesh Harjani (IBM) wrote:
> +static void ifs_calc_range(struct folio *folio, size_t off, size_t len,
> +		enum iomap_block_state state, unsigned int *first_blkp,
> +		unsigned int *nr_blksp)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +	unsigned int first = off >> inode->i_blkbits;
> +	unsigned int last = (off + len - 1) >> inode->i_blkbits;
> +
> +	*first_blkp = first + (state * blks_per_folio);
> +	*nr_blksp = last - first + 1;
> +}

As I said, this is not 'first_blkp'.  It's first_bitp.  I think this
misunderstanding is related to Andreas' complaint, but it's not quite
the same.

>  static inline bool ifs_is_fully_uptodate(struct folio *folio,
>  					       struct iomap_folio_state *ifs)
>  {
>  	struct inode *inode = folio->mapping->host;
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +	unsigned int nr_blks = (IOMAP_ST_UPTODATE + 1) * blks_per_folio;
>  
> -	return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
> +	return bitmap_full(ifs->state, nr_blks);

I think we have a gap in our bitmap APIs.  We don't have a
'bitmap_range_full(src, pos, nbits)'.  We could use find_next_zero_bit(),
but that's going to do more work than necessary.

Given this lack, perhaps it's time to say that you're making all of
this too hard by using an enum, and pretending that we can switch the
positions of 'uptodate' and 'dirty' in the bitmap just by changing
the enum.  Define the uptodate bits to be the first ones in the bitmap,
document it (and why), and leave it at that.
Ritesh Harjani (IBM) June 19, 2023, 4:25 p.m. UTC | #3
Matthew Wilcox <willy@infradead.org> writes:

> On Mon, Jun 19, 2023 at 07:58:51AM +0530, Ritesh Harjani (IBM) wrote:
>> +static void ifs_calc_range(struct folio *folio, size_t off, size_t len,
>> +		enum iomap_block_state state, unsigned int *first_blkp,
>> +		unsigned int *nr_blksp)
>> +{
>> +	struct inode *inode = folio->mapping->host;
>> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
>> +	unsigned int first = off >> inode->i_blkbits;
>> +	unsigned int last = (off + len - 1) >> inode->i_blkbits;
>> +
>> +	*first_blkp = first + (state * blks_per_folio);
>> +	*nr_blksp = last - first + 1;
>> +}
>
> As I said, this is not 'first_blkp'.  It's first_bitp.  I think this
> misunderstanding is related to Andreas' complaint, but it's not quite
> the same.
>

We represent each FS block as a bit in the bitmap. So first_blkp or
first_bitp or first_blkbitp essentially means the same. 
I went with first_blk, first_blkp in the first place based on your
suggestion itself [1].

[1]: https://lore.kernel.org/linux-xfs/Y%2FvxlVUJ31PZYaRa@casper.infradead.org/

>>  static inline bool ifs_is_fully_uptodate(struct folio *folio,
>>  					       struct iomap_folio_state *ifs)
>>  {
>>  	struct inode *inode = folio->mapping->host;
>> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
>> +	unsigned int nr_blks = (IOMAP_ST_UPTODATE + 1) * blks_per_folio;
>>  
>> -	return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
>> +	return bitmap_full(ifs->state, nr_blks);
>
> I think we have a gap in our bitmap APIs.  We don't have a
> 'bitmap_range_full(src, pos, nbits)'.  We could use find_next_zero_bit(),
> but that's going to do more work than necessary.
>
> Given this lack, perhaps it's time to say that you're making all of
> this too hard by using an enum, and pretending that we can switch the
> positions of 'uptodate' and 'dirty' in the bitmap just by changing
> the enum.

Actually I never wanted to use the the enum this way. That's why I was
not fond of the idea behind using enum in all the bitmap state
manipulation APIs (test/set/).

It was only intended to be passed as a state argument to ifs_calc_range()
function to keep all the first_blkp and nr_blksp calculation at one
place. And just use it's IOMAP_ST_MAX value while allocating state bitmap.
It was never intended to be used like this.

We can even now go back to this original idea and keep the use of the
enum limited to what I just mentioned above i.e. for ifs_calc_range().

And maybe just use this in ifs_alloc()?
BUILD_BUG_ON(IOMAP_ST_UPTODATE == 0);
BUILD_BUG_ON(IOMAP_ST_DIRTY == 1);

> Define the uptodate bits to be the first ones in the bitmap,
> document it (and why), and leave it at that.

Do you think we can go with above suggestion, or do you still think we
need to drop it?

In case if we drop it, then should we open code the calculations for
first_blk, last_blk? These calculations are done in exact same fashion
at 3 places ifs_set_range_uptodate(), ifs_clear_range_dirty() and
ifs_set_range_dirty().
Thoughts?

-ritesh
Matthew Wilcox (Oracle) June 19, 2023, 4:54 p.m. UTC | #4
On Mon, Jun 19, 2023 at 09:55:53PM +0530, Ritesh Harjani wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> 
> > On Mon, Jun 19, 2023 at 07:58:51AM +0530, Ritesh Harjani (IBM) wrote:
> >> +static void ifs_calc_range(struct folio *folio, size_t off, size_t len,
> >> +		enum iomap_block_state state, unsigned int *first_blkp,
> >> +		unsigned int *nr_blksp)
> >> +{
> >> +	struct inode *inode = folio->mapping->host;
> >> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> >> +	unsigned int first = off >> inode->i_blkbits;
> >> +	unsigned int last = (off + len - 1) >> inode->i_blkbits;
> >> +
> >> +	*first_blkp = first + (state * blks_per_folio);
> >> +	*nr_blksp = last - first + 1;
> >> +}
> >
> > As I said, this is not 'first_blkp'.  It's first_bitp.  I think this
> > misunderstanding is related to Andreas' complaint, but it's not quite
> > the same.
> >
> 
> We represent each FS block as a bit in the bitmap. So first_blkp or
> first_bitp or first_blkbitp essentially means the same. 
> I went with first_blk, first_blkp in the first place based on your
> suggestion itself [1].

No, it's not the same!  If you have 1kB blocks in a 64kB page, they're
numbered 0-63.  If you 'calc_range' for any of the dirty bits, you get
back a number in the range 64-127.  That's not a block number!  It's
the number of the bit you want to refer to.  Calling it blkp is going
to lead to confusion -- as you yourself seem to be confused.

> [1]: https://lore.kernel.org/linux-xfs/Y%2FvxlVUJ31PZYaRa@casper.infradead.org/

Those _were_ block numbers!  off >> inode->i_blkbits calculates a block
number.  (off >> inode->i_blkbits) + blocks_per_folio() does not calculate
a block number, it calculates a bit number.

> >> -	return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
> >> +	return bitmap_full(ifs->state, nr_blks);
> >
> > I think we have a gap in our bitmap APIs.  We don't have a
> > 'bitmap_range_full(src, pos, nbits)'.  We could use find_next_zero_bit(),
> > but that's going to do more work than necessary.
> >
> > Given this lack, perhaps it's time to say that you're making all of
> > this too hard by using an enum, and pretending that we can switch the
> > positions of 'uptodate' and 'dirty' in the bitmap just by changing
> > the enum.
> 
> Actually I never wanted to use the the enum this way. That's why I was
> not fond of the idea behind using enum in all the bitmap state
> manipulation APIs (test/set/).
> 
> It was only intended to be passed as a state argument to ifs_calc_range()
> function to keep all the first_blkp and nr_blksp calculation at one
> place. And just use it's IOMAP_ST_MAX value while allocating state bitmap.
> It was never intended to be used like this.
> 
> We can even now go back to this original idea and keep the use of the
> enum limited to what I just mentioned above i.e. for ifs_calc_range().
> 
> And maybe just use this in ifs_alloc()?
> BUILD_BUG_ON(IOMAP_ST_UPTODATE == 0);
> BUILD_BUG_ON(IOMAP_ST_DIRTY == 1);
> 
> > Define the uptodate bits to be the first ones in the bitmap,
> > document it (and why), and leave it at that.
> 
> Do you think we can go with above suggestion, or do you still think we
> need to drop it?
> 
> In case if we drop it, then should we open code the calculations for
> first_blk, last_blk? These calculations are done in exact same fashion
> at 3 places ifs_set_range_uptodate(), ifs_clear_range_dirty() and
> ifs_set_range_dirty().
> Thoughts?

I disliked the enum from the moment I saw it, but didn't care enough to
say so.

Look, an abstraction should have a _purpose_.  The enum doesn't.  I'd
ditch this calc_range function entirely; it's just not worth it.
Ritesh Harjani (IBM) June 19, 2023, 4:55 p.m. UTC | #5
Andreas Gruenbacher <agruenba@redhat.com> writes:

> On Mon, Jun 19, 2023 at 4:29 AM Ritesh Harjani (IBM)
> <ritesh.list@gmail.com> wrote:
>> When filesystem blocksize is less than folio size (either with
>> mapping_large_folio_support() or with blocksize < pagesize) and when the
>> folio is uptodate in pagecache, then even a byte write can cause
>> an entire folio to be written to disk during writeback. This happens
>> because we currently don't have a mechanism to track per-block dirty
>> state within struct iomap_folio_state. We currently only track uptodate
>> state.
>>
>> This patch implements support for tracking per-block dirty state in
>> iomap_folio_state->state bitmap. This should help improve the filesystem
>> write performance and help reduce write amplification.
>>
>> Performance testing of below fio workload reveals ~16x performance
>> improvement using nvme with XFS (4k blocksize) on Power (64K pagesize)
>> FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
>>
>> 1. <test_randwrite.fio>
>> [global]
>>         ioengine=psync
>>         rw=randwrite
>>         overwrite=1
>>         pre_read=1
>>         direct=0
>>         bs=4k
>>         size=1G
>>         dir=./
>>         numjobs=8
>>         fdatasync=1
>>         runtime=60
>>         iodepth=64
>>         group_reporting=1
>>
>> [fio-run]
>>
>> 2. Also our internal performance team reported that this patch improves
>>    their database workload performance by around ~83% (with XFS on Power)
>>
>> Reported-by: Aravinda Herle <araherle@in.ibm.com>
>> Reported-by: Brian Foster <bfoster@redhat.com>
>> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
>> ---
>>  fs/gfs2/aops.c         |   2 +-
>>  fs/iomap/buffered-io.c | 189 ++++++++++++++++++++++++++++++++++++-----
>>  fs/xfs/xfs_aops.c      |   2 +-
>>  fs/zonefs/file.c       |   2 +-
>>  include/linux/iomap.h  |   1 +
>>  5 files changed, 171 insertions(+), 25 deletions(-)
>>
>> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
>> index a5f4be6b9213..75efec3c3b71 100644
>> --- a/fs/gfs2/aops.c
>> +++ b/fs/gfs2/aops.c
>> @@ -746,7 +746,7 @@ static const struct address_space_operations gfs2_aops = {
>>         .writepages = gfs2_writepages,
>>         .read_folio = gfs2_read_folio,
>>         .readahead = gfs2_readahead,
>> -       .dirty_folio = filemap_dirty_folio,
>> +       .dirty_folio = iomap_dirty_folio,
>>         .release_folio = iomap_release_folio,
>>         .invalidate_folio = iomap_invalidate_folio,
>>         .bmap = gfs2_bmap,
>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>> index 391d918ddd22..50f5840bb5f9 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -25,7 +25,7 @@
>>
>>  typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
>>  /*
>> - * Structure allocated for each folio to track per-block uptodate state
>> + * Structure allocated for each folio to track per-block uptodate, dirty state
>>   * and I/O completions.
>>   */
>>  struct iomap_folio_state {
>> @@ -35,31 +35,55 @@ struct iomap_folio_state {
>>         unsigned long           state[];
>>  };
>>
>> +enum iomap_block_state {
>> +       IOMAP_ST_UPTODATE,
>> +       IOMAP_ST_DIRTY,
>> +
>> +       IOMAP_ST_MAX,
>> +};
>> +
>> +static void ifs_calc_range(struct folio *folio, size_t off, size_t len,
>> +               enum iomap_block_state state, unsigned int *first_blkp,
>> +               unsigned int *nr_blksp)
>> +{
>> +       struct inode *inode = folio->mapping->host;
>> +       unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
>> +       unsigned int first = off >> inode->i_blkbits;
>> +       unsigned int last = (off + len - 1) >> inode->i_blkbits;
>> +
>> +       *first_blkp = first + (state * blks_per_folio);
>> +       *nr_blksp = last - first + 1;
>> +}
>> +
>>  static struct bio_set iomap_ioend_bioset;
>>
>>  static inline bool ifs_is_fully_uptodate(struct folio *folio,
>>                                                struct iomap_folio_state *ifs)
>>  {
>>         struct inode *inode = folio->mapping->host;
>> +       unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
>> +       unsigned int nr_blks = (IOMAP_ST_UPTODATE + 1) * blks_per_folio;
>
> This nr_blks calculation doesn't make sense.
>

About this, I have replied with more details here [1]

[1]: https://lore.kernel.org/linux-xfs/87o7lbmnam.fsf@doe.com/


>> -       return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
>> +       return bitmap_full(ifs->state, nr_blks);
>
> Could you please change this to:
>
> BUILD_BUG_ON(IOMAP_ST_UPTODATE != 0);

ditto

> return bitmap_full(ifs->state, blks_per_folio);
>
> Also, I'm seeing that the value of i_blocks_per_folio() is assigned to
> local variables with various names in several places (blks_per_folio,
> nr_blocks, nblocks). Maybe this could be made consistent.
>

I remember giving it a try, but then it was really not worth it because
all of above naming also does make sense in the way they are getting
used currently in the code. So, I think as long as it is clear behind
what those variables means and how are they getting used, it should be
ok, IMO. 

-ritesh
Ritesh Harjani (IBM) June 19, 2023, 5:29 p.m. UTC | #6
Matthew Wilcox <willy@infradead.org> writes:

> On Mon, Jun 19, 2023 at 09:55:53PM +0530, Ritesh Harjani wrote:
>> Matthew Wilcox <willy@infradead.org> writes:
>> 
>> > On Mon, Jun 19, 2023 at 07:58:51AM +0530, Ritesh Harjani (IBM) wrote:
>> >> +static void ifs_calc_range(struct folio *folio, size_t off, size_t len,
>> >> +		enum iomap_block_state state, unsigned int *first_blkp,
>> >> +		unsigned int *nr_blksp)
>> >> +{
>> >> +	struct inode *inode = folio->mapping->host;
>> >> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
>> >> +	unsigned int first = off >> inode->i_blkbits;
>> >> +	unsigned int last = (off + len - 1) >> inode->i_blkbits;
>> >> +
>> >> +	*first_blkp = first + (state * blks_per_folio);
>> >> +	*nr_blksp = last - first + 1;
>> >> +}
>> >
>> > As I said, this is not 'first_blkp'.  It's first_bitp.  I think this
>> > misunderstanding is related to Andreas' complaint, but it's not quite
>> > the same.
>> >
>> 
>> We represent each FS block as a bit in the bitmap. So first_blkp or
>> first_bitp or first_blkbitp essentially means the same. 
>> I went with first_blk, first_blkp in the first place based on your
>> suggestion itself [1].
>
> No, it's not the same!  If you have 1kB blocks in a 64kB page, they're
> numbered 0-63.  If you 'calc_range' for any of the dirty bits, you get
> back a number in the range 64-127.  That's not a block number!  It's
> the number of the bit you want to refer to.  Calling it blkp is going
> to lead to confusion -- as you yourself seem to be confused.
>
>> [1]: https://lore.kernel.org/linux-xfs/Y%2FvxlVUJ31PZYaRa@casper.infradead.org/
>
> Those _were_ block numbers!  off >> inode->i_blkbits calculates a block
> number.  (off >> inode->i_blkbits) + blocks_per_folio() does not calculate
> a block number, it calculates a bit number.
>

Yes, I don't mind changing it to _bit. It is derived out of an FS block
representation only. But I agree with your above argument using _bit in
variable name makes it explicit and clear.

>> >> -	return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
>> >> +	return bitmap_full(ifs->state, nr_blks);
>> >
>> > I think we have a gap in our bitmap APIs.  We don't have a
>> > 'bitmap_range_full(src, pos, nbits)'.  We could use find_next_zero_bit(),
>> > but that's going to do more work than necessary.
>> >
>> > Given this lack, perhaps it's time to say that you're making all of
>> > this too hard by using an enum, and pretending that we can switch the
>> > positions of 'uptodate' and 'dirty' in the bitmap just by changing
>> > the enum.
>> 
>> Actually I never wanted to use the the enum this way. That's why I was
>> not fond of the idea behind using enum in all the bitmap state
>> manipulation APIs (test/set/).
>> 
>> It was only intended to be passed as a state argument to ifs_calc_range()
>> function to keep all the first_blkp and nr_blksp calculation at one
>> place. And just use it's IOMAP_ST_MAX value while allocating state bitmap.
>> It was never intended to be used like this.
>> 
>> We can even now go back to this original idea and keep the use of the
>> enum limited to what I just mentioned above i.e. for ifs_calc_range().
>> 
>> And maybe just use this in ifs_alloc()?
>> BUILD_BUG_ON(IOMAP_ST_UPTODATE == 0);
>> BUILD_BUG_ON(IOMAP_ST_DIRTY == 1);
>> 
>> > Define the uptodate bits to be the first ones in the bitmap,
>> > document it (and why), and leave it at that.
>> 
>> Do you think we can go with above suggestion, or do you still think we
>> need to drop it?
>> 
>> In case if we drop it, then should we open code the calculations for
>> first_blk, last_blk? These calculations are done in exact same fashion
>> at 3 places ifs_set_range_uptodate(), ifs_clear_range_dirty() and
>> ifs_set_range_dirty().
>> Thoughts?
>
> I disliked the enum from the moment I saw it, but didn't care enough to
> say so.
>
> Look, an abstraction should have a _purpose_.  The enum doesn't.  I'd
> ditch this calc_range function entirely; it's just not worth it.

I guess enum is creating more confusion with almost everyone than adding value.
So I don't mind ditching it (unless anyone else opposes for keeping it).

Also it would be helpful if you could let me know of any other review
comments on the rest of the patch? Does the rest looks good to you?

-ritesh
Darrick J. Wong June 20, 2023, 5:01 a.m. UTC | #7
On Mon, Jun 19, 2023 at 10:59:09PM +0530, Ritesh Harjani wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> 
> > On Mon, Jun 19, 2023 at 09:55:53PM +0530, Ritesh Harjani wrote:
> >> Matthew Wilcox <willy@infradead.org> writes:
> >> 
> >> > On Mon, Jun 19, 2023 at 07:58:51AM +0530, Ritesh Harjani (IBM) wrote:
> >> >> +static void ifs_calc_range(struct folio *folio, size_t off, size_t len,
> >> >> +		enum iomap_block_state state, unsigned int *first_blkp,
> >> >> +		unsigned int *nr_blksp)
> >> >> +{
> >> >> +	struct inode *inode = folio->mapping->host;
> >> >> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> >> >> +	unsigned int first = off >> inode->i_blkbits;
> >> >> +	unsigned int last = (off + len - 1) >> inode->i_blkbits;
> >> >> +
> >> >> +	*first_blkp = first + (state * blks_per_folio);
> >> >> +	*nr_blksp = last - first + 1;
> >> >> +}
> >> >
> >> > As I said, this is not 'first_blkp'.  It's first_bitp.  I think this
> >> > misunderstanding is related to Andreas' complaint, but it's not quite
> >> > the same.
> >> >
> >> 
> >> We represent each FS block as a bit in the bitmap. So first_blkp or
> >> first_bitp or first_blkbitp essentially means the same. 
> >> I went with first_blk, first_blkp in the first place based on your
> >> suggestion itself [1].
> >
> > No, it's not the same!  If you have 1kB blocks in a 64kB page, they're
> > numbered 0-63.  If you 'calc_range' for any of the dirty bits, you get
> > back a number in the range 64-127.  That's not a block number!  It's
> > the number of the bit you want to refer to.  Calling it blkp is going
> > to lead to confusion -- as you yourself seem to be confused.
> >
> >> [1]: https://lore.kernel.org/linux-xfs/Y%2FvxlVUJ31PZYaRa@casper.infradead.org/
> >
> > Those _were_ block numbers!  off >> inode->i_blkbits calculates a block
> > number.  (off >> inode->i_blkbits) + blocks_per_folio() does not calculate
> > a block number, it calculates a bit number.
> >
> 
> Yes, I don't mind changing it to _bit. It is derived out of an FS block
> representation only. But I agree with your above argument using _bit in
> variable name makes it explicit and clear.
> 
> >> >> -	return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
> >> >> +	return bitmap_full(ifs->state, nr_blks);
> >> >
> >> > I think we have a gap in our bitmap APIs.  We don't have a
> >> > 'bitmap_range_full(src, pos, nbits)'.  We could use find_next_zero_bit(),
> >> > but that's going to do more work than necessary.
> >> >
> >> > Given this lack, perhaps it's time to say that you're making all of
> >> > this too hard by using an enum, and pretending that we can switch the
> >> > positions of 'uptodate' and 'dirty' in the bitmap just by changing
> >> > the enum.
> >> 
> >> Actually I never wanted to use the the enum this way. That's why I was
> >> not fond of the idea behind using enum in all the bitmap state
> >> manipulation APIs (test/set/).
> >> 
> >> It was only intended to be passed as a state argument to ifs_calc_range()
> >> function to keep all the first_blkp and nr_blksp calculation at one
> >> place. And just use it's IOMAP_ST_MAX value while allocating state bitmap.
> >> It was never intended to be used like this.
> >> 
> >> We can even now go back to this original idea and keep the use of the
> >> enum limited to what I just mentioned above i.e. for ifs_calc_range().
> >> 
> >> And maybe just use this in ifs_alloc()?
> >> BUILD_BUG_ON(IOMAP_ST_UPTODATE == 0);
> >> BUILD_BUG_ON(IOMAP_ST_DIRTY == 1);
> >> 
> >> > Define the uptodate bits to be the first ones in the bitmap,
> >> > document it (and why), and leave it at that.
> >> 
> >> Do you think we can go with above suggestion, or do you still think we
> >> need to drop it?
> >> 
> >> In case if we drop it, then should we open code the calculations for
> >> first_blk, last_blk? These calculations are done in exact same fashion
> >> at 3 places ifs_set_range_uptodate(), ifs_clear_range_dirty() and
> >> ifs_set_range_dirty().
> >> Thoughts?
> >
> > I disliked the enum from the moment I saw it, but didn't care enough to
> > say so.
> >
> > Look, an abstraction should have a _purpose_.  The enum doesn't.  I'd
> > ditch this calc_range function entirely; it's just not worth it.
> 
> I guess enum is creating more confusion with almost everyone than adding value.
> So I don't mind ditching it (unless anyone else opposes for keeping it).
> 
> Also it would be helpful if you could let me know of any other review
> comments on the rest of the patch? Does the rest looks good to you?

I deleted my entire angry rant about how this review has turned a
fairly simple design change into a big mess that even the reviewers
don't understand anymore.  I'm on vacation, I DGAF anymore.

Ritesh: Dump the enum; "because btrfs does it" is not sufficient
justification.  The rest is good enough, I'll put it in iomap-for-next
along with willy's thing as soon as 6.5-rc1 closes, and if you all have
further complaints, send your own patches.

--D

> -ritesh
Ritesh Harjani (IBM) June 20, 2023, 5:58 a.m. UTC | #8
"Darrick J. Wong" <djwong@kernel.org> writes:

> Ritesh: Dump the enum; "because btrfs does it" is not sufficient
> justification.  The rest is good enough, I'll put it in iomap-for-next
> along with willy's thing as soon as 6.5-rc1 closes, and if you all have
> further complaints, send your own patches.

Sure, Darrick. I agree that enum created more confusion than help (sorry
aobut that). I will dump it in the next revision & address Matthew's
review comment on the variable naming.

I agree to lock the current set of changes as functionality wise those
are looking good from everyone. Any further optimizations/improvements
can be handled seperately so that we don't complicate this one.  

Thanks once again Darrick and all the other reviewers!!

-ritesh

I am pulled into something else for next couple of days. Post that I
will work on it to get next revision out.
diff mbox series

Patch

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a5f4be6b9213..75efec3c3b71 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -746,7 +746,7 @@  static const struct address_space_operations gfs2_aops = {
 	.writepages = gfs2_writepages,
 	.read_folio = gfs2_read_folio,
 	.readahead = gfs2_readahead,
-	.dirty_folio = filemap_dirty_folio,
+	.dirty_folio = iomap_dirty_folio,
 	.release_folio = iomap_release_folio,
 	.invalidate_folio = iomap_invalidate_folio,
 	.bmap = gfs2_bmap,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 391d918ddd22..50f5840bb5f9 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -25,7 +25,7 @@ 
 
 typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
 /*
- * Structure allocated for each folio to track per-block uptodate state
+ * Structure allocated for each folio to track per-block uptodate, dirty state
  * and I/O completions.
  */
 struct iomap_folio_state {
@@ -35,31 +35,55 @@  struct iomap_folio_state {
 	unsigned long		state[];
 };
 
+enum iomap_block_state {
+	IOMAP_ST_UPTODATE,
+	IOMAP_ST_DIRTY,
+
+	IOMAP_ST_MAX,
+};
+
+static void ifs_calc_range(struct folio *folio, size_t off, size_t len,
+		enum iomap_block_state state, unsigned int *first_blkp,
+		unsigned int *nr_blksp)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first = off >> inode->i_blkbits;
+	unsigned int last = (off + len - 1) >> inode->i_blkbits;
+
+	*first_blkp = first + (state * blks_per_folio);
+	*nr_blksp = last - first + 1;
+}
+
 static struct bio_set iomap_ioend_bioset;
 
 static inline bool ifs_is_fully_uptodate(struct folio *folio,
 					       struct iomap_folio_state *ifs)
 {
 	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int nr_blks = (IOMAP_ST_UPTODATE + 1) * blks_per_folio;
 
-	return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
+	return bitmap_full(ifs->state, nr_blks);
 }
 
-static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
-					       unsigned int block)
+static inline bool ifs_block_is_uptodate(struct folio *folio,
+		struct iomap_folio_state *ifs, unsigned int block)
 {
-	return test_bit(block, ifs->state);
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+	return test_bit(block + IOMAP_ST_UPTODATE * blks_per_folio, ifs->state);
 }
 
 static void ifs_set_range_uptodate(struct folio *folio,
 		struct iomap_folio_state *ifs, size_t off, size_t len)
 {
-	struct inode *inode = folio->mapping->host;
-	unsigned int first_blk = off >> inode->i_blkbits;
-	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
-	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned int first_blk, nr_blks;
 	unsigned long flags;
 
+	ifs_calc_range(folio, off, len, IOMAP_ST_UPTODATE, &first_blk,
+			     &nr_blks);
 	spin_lock_irqsave(&ifs->state_lock, flags);
 	bitmap_set(ifs->state, first_blk, nr_blks);
 	if (ifs_is_fully_uptodate(folio, ifs))
@@ -78,6 +102,55 @@  static void iomap_set_range_uptodate(struct folio *folio, size_t off,
 		folio_mark_uptodate(folio);
 }
 
+static inline bool ifs_block_is_dirty(struct folio *folio,
+		struct iomap_folio_state *ifs, int block)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+	return test_bit(block + IOMAP_ST_DIRTY * blks_per_folio, ifs->state);
+}
+
+static void ifs_clear_range_dirty(struct folio *folio,
+		struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+	unsigned int first_blk, nr_blks;
+	unsigned long flags;
+
+	ifs_calc_range(folio, off, len, IOMAP_ST_DIRTY, &first_blk, &nr_blks);
+	spin_lock_irqsave(&ifs->state_lock, flags);
+	bitmap_clear(ifs->state, first_blk, nr_blks);
+	spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (ifs)
+		ifs_clear_range_dirty(folio, ifs, off, len);
+}
+
+static void ifs_set_range_dirty(struct folio *folio,
+		struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+	unsigned int first_blk, nr_blks;
+	unsigned long flags;
+
+	ifs_calc_range(folio, off, len, IOMAP_ST_DIRTY, &first_blk, &nr_blks);
+	spin_lock_irqsave(&ifs->state_lock, flags);
+	bitmap_set(ifs->state, first_blk, nr_blks);
+	spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (ifs)
+		ifs_set_range_dirty(folio, ifs, off, len);
+}
+
 static struct iomap_folio_state *ifs_alloc(struct inode *inode,
 		struct folio *folio, unsigned int flags)
 {
@@ -93,14 +166,24 @@  static struct iomap_folio_state *ifs_alloc(struct inode *inode,
 	else
 		gfp = GFP_NOFS | __GFP_NOFAIL;
 
-	ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(nr_blocks)),
-		      gfp);
-	if (ifs) {
-		spin_lock_init(&ifs->state_lock);
-		if (folio_test_uptodate(folio))
-			bitmap_fill(ifs->state, nr_blocks);
-		folio_attach_private(folio, ifs);
-	}
+	/*
+	 * ifs->state tracks two sets of state flags when the
+	 * filesystem block size is smaller than the folio size.
+	 * The first state tracks per-block uptodate and the
+	 * second tracks per-block dirty state.
+	 */
+	ifs = kzalloc(struct_size(ifs, state,
+		      BITS_TO_LONGS(IOMAP_ST_MAX * nr_blocks)), gfp);
+	if (!ifs)
+		return ifs;
+
+	spin_lock_init(&ifs->state_lock);
+	if (folio_test_uptodate(folio))
+		bitmap_set(ifs->state, 0, nr_blocks);
+	if (folio_test_dirty(folio))
+		bitmap_set(ifs->state, nr_blocks, nr_blocks);
+	folio_attach_private(folio, ifs);
+
 	return ifs;
 }
 
@@ -143,7 +226,7 @@  static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
 
 		/* move forward for each leading block marked uptodate */
 		for (i = first; i <= last; i++) {
-			if (!ifs_block_is_uptodate(ifs, i))
+			if (!ifs_block_is_uptodate(folio, ifs, i))
 				break;
 			*pos += block_size;
 			poff += block_size;
@@ -153,7 +236,7 @@  static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
 
 		/* truncate len if we find any trailing uptodate block(s) */
 		for ( ; i <= last; i++) {
-			if (ifs_block_is_uptodate(ifs, i)) {
+			if (ifs_block_is_uptodate(folio, ifs, i)) {
 				plen -= (last - i + 1) * block_size;
 				last = i - 1;
 				break;
@@ -457,7 +540,7 @@  bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
 	last = (from + count - 1) >> inode->i_blkbits;
 
 	for (i = first; i <= last; i++)
-		if (!ifs_block_is_uptodate(ifs, i))
+		if (!ifs_block_is_uptodate(folio, ifs, i))
 			return false;
 	return true;
 }
@@ -523,6 +606,17 @@  void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
 
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+	struct inode *inode = mapping->host;
+	size_t len = folio_size(folio);
+
+	ifs_alloc(inode, folio, 0);
+	iomap_set_range_dirty(folio, 0, len);
+	return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -727,6 +821,7 @@  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	if (unlikely(copied < len && !folio_test_uptodate(folio)))
 		return 0;
 	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
+	iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return copied;
 }
@@ -891,6 +986,43 @@  iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+		struct folio *folio, loff_t start_byte, loff_t end_byte,
+		iomap_punch_t punch)
+{
+	unsigned int first_blk, last_blk, i;
+	loff_t last_byte;
+	u8 blkbits = inode->i_blkbits;
+	struct iomap_folio_state *ifs;
+	int ret = 0;
+
+	/*
+	 * When we have per-block dirty tracking, there can be
+	 * blocks within a folio which are marked uptodate
+	 * but not dirty. In that case it is necessary to punch
+	 * out such blocks to avoid leaking any delalloc blocks.
+	 */
+	ifs = folio->private;
+	if (!ifs)
+		return ret;
+
+	last_byte = min_t(loff_t, end_byte - 1,
+			folio_pos(folio) + folio_size(folio) - 1);
+	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
+	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
+	for (i = first_blk; i <= last_blk; i++) {
+		if (!ifs_block_is_dirty(folio, ifs, i)) {
+			ret = punch(inode, folio_pos(folio) + (i << blkbits),
+				    1 << blkbits);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+
 static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
 		iomap_punch_t punch)
@@ -907,6 +1039,13 @@  static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 		if (ret)
 			return ret;
 	}
+
+	/* Punch non-dirty blocks within folio */
+	ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
+			end_byte, punch);
+	if (ret)
+		return ret;
+
 	/*
 	 * Make sure the next punch start is correctly bound to
 	 * the end of this data range, not the end of the folio.
@@ -1637,7 +1776,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct inode *inode,
 		struct folio *folio, u64 end_pos)
 {
-	struct iomap_folio_state *ifs = ifs_alloc(inode, folio, 0);
+	struct iomap_folio_state *ifs = folio->private;
 	struct iomap_ioend *ioend, *next;
 	unsigned len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1645,6 +1784,11 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	int error = 0, count = 0, i;
 	LIST_HEAD(submit_list);
 
+	if (!ifs && nblocks > 1) {
+		ifs = ifs_alloc(inode, folio, 0);
+		iomap_set_range_dirty(folio, 0, folio_size(folio));
+	}
+
 	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
 
 	/*
@@ -1653,7 +1797,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * invalid, grab a new one.
 	 */
 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (ifs && !ifs_block_is_uptodate(ifs, i))
+		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
 			continue;
 
 		error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1697,6 +1841,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		}
 	}
 
+	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
 	folio_start_writeback(folio);
 	folio_unlock(folio);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 451942fb38ec..2fca4b4e7fd8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -578,7 +578,7 @@  const struct address_space_operations xfs_address_space_operations = {
 	.read_folio		= xfs_vm_read_folio,
 	.readahead		= xfs_vm_readahead,
 	.writepages		= xfs_vm_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.bmap			= xfs_vm_bmap,
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 132f01d3461f..e508c8e97372 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -175,7 +175,7 @@  const struct address_space_operations zonefs_file_aops = {
 	.read_folio		= zonefs_read_folio,
 	.readahead		= zonefs_readahead,
 	.writepages		= zonefs_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.migrate_folio		= filemap_migrate_folio,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..eb9335c46bf3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -264,6 +264,7 @@  bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,