[RFC,2/2] iomap: Support subpage size dirty tracking to improve write performance

Message ID: 886076cfa6f547d22765c522177d33cf621013d2.1666928993.git.ritesh.list@gmail.com
State: New, archived
Series: iomap: Add support for subpage dirty state tracking to improve write performance

Commit Message

Ritesh Harjani (IBM) Oct. 28, 2022, 4:30 a.m. UTC
On 64k pagesize platforms (especially Power and/or aarch64) with a 4k
filesystem blocksize, this patch should improve write performance by
writing back only the dirty subpage ranges.

This should also reduce write amplification, since we can now track
subpage dirty status within the state bitmap. Previously we had to
write the entire 64k page even if only a part of it (e.g. 4k) was
updated.

Performance testing of the below fio workload reveals a ~16x performance
improvement on NVMe with XFS (4k blocksize) on Power (64K pagesize).
FIO-reported write bandwidth improved from around ~28 MBps to ~452 MBps.

<test_randwrite.fio>
[global]
	ioengine=psync
	rw=randwrite
	overwrite=1
	pre_read=1
	direct=0
	bs=4k
	size=1G
	dir=./
	numjobs=8
	fdatasync=1
	runtime=60
	iodepth=64
	group_reporting=1

[fio-run]

Reported-by: Aravinda Herle <araherle@in.ibm.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 fs/iomap/buffered-io.c | 53 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)
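
For reference, the iop->state bitmap layout implied by the patch below,
for a folio with nr_blocks = i_blocks_per_folio() filesystem blocks
(the offsets follow directly from the "+ nr_blocks" adjustments in the
new helpers):

	/*
	 * bits [0, nr_blocks)             - per-block uptodate state
	 * bits [nr_blocks, 2 * nr_blocks) - per-block dirty state
	 *
	 * e.g. a 64k folio with 4k blocks has nr_blocks = 16, so bit i
	 * tracks uptodate for block i and bit (i + 16) tracks dirty
	 * for block i.
	 */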

Comments

Matthew Wilcox (Oracle) Oct. 28, 2022, 12:42 p.m. UTC | #1
On Fri, Oct 28, 2022 at 10:00:33AM +0530, Ritesh Harjani (IBM) wrote:
> @@ -1354,7 +1399,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	 * invalid, grab a new one.
>  	 */
>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> -		if (iop && !test_bit(i, iop->state))
> +		if (iop && (!test_bit(i, iop->state) ||
> +			    !test_bit(i + nblocks, iop->state)))
>  			continue;
>  
>  		error = wpc->ops->map_blocks(wpc, inode, pos);

Why do we need to test both uptodate and dirty?  Surely we only need to
test the dirty bit?  How can a !uptodate block ever be marked as dirty?

More generally, I think open-coding this is going to lead to confusion.
We need wrappers like 'iop_block_dirty()' and 'iop_block_uptodate()'.
(iop is still a bad name for this, but nobody's stepped up with a better
one yet).
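
A minimal sketch of what such wrappers might look like, assuming the
bitmap layout from this patch (uptodate bits in the first nr_blocks
bits of iop->state, dirty bits offset by nr_blocks); the names are
only illustrative:

	static inline bool iop_block_uptodate(struct iomap_page *iop,
			unsigned int block)
	{
		return test_bit(block, iop->state);
	}

	static inline bool iop_block_dirty(struct folio *folio,
			struct iomap_page *iop, unsigned int block)
	{
		struct inode *inode = folio->mapping->host;

		/* dirty bits live in the second half of iop->state */
		return test_bit(block + i_blocks_per_folio(inode, folio),
				iop->state);
	}

The writeback loop above could then simply read
"if (iop && !iop_block_dirty(folio, iop, i)) continue;".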
Darrick J. Wong Oct. 28, 2022, 5:01 p.m. UTC | #2
On Fri, Oct 28, 2022 at 10:00:33AM +0530, Ritesh Harjani (IBM) wrote:
> On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
> filesystem blocksize, this patch should improve the performance by doing
> only the subpage dirty data write.
> 
> This should also reduce the write amplification since we can now track
> subpage dirty status within state bitmaps. Earlier we had to
> write the entire 64k page even if only a part of it (e.g. 4k) was
> updated.
> 
> Performance testing of below fio workload reveals ~16x performance
> improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
> FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> 
> <test_randwrite.fio>
> [global]
> 	ioengine=psync
> 	rw=randwrite
> 	overwrite=1
> 	pre_read=1
> 	direct=0
> 	bs=4k
> 	size=1G
> 	dir=./
> 	numjobs=8
> 	fdatasync=1
> 	runtime=60
> 	iodepth=64
> 	group_reporting=1
> 
> [fio-run]
> 
> Reported-by: Aravinda Herle <araherle@in.ibm.com>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> ---
>  fs/iomap/buffered-io.c | 53 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 51 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 255f9f92668c..31ee80a996b2 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -58,7 +58,7 @@ iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
>  	else
>  		gfp = GFP_NOFS | __GFP_NOFAIL;
>  
> -	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
> +	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(2 * nr_blocks)),
>  		      gfp);
>  	if (iop) {
>  		spin_lock_init(&iop->state_lock);
> @@ -168,6 +168,48 @@ static void iomap_set_range_uptodate(struct folio *folio,
>  		folio_mark_uptodate(folio);
>  }
>  
> +static void iomap_iop_set_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> +	unsigned first = (off >> inode->i_blkbits) + nr_blocks;
> +	unsigned last = ((off + len - 1) >> inode->i_blkbits) + nr_blocks;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iop->state_lock, flags);
> +	bitmap_set(iop->state, first, last - first + 1);
> +	spin_unlock_irqrestore(&iop->state_lock, flags);
> +}
> +
> +static void iomap_set_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	if (iop)
> +		iomap_iop_set_range_dirty(folio, iop, off, len);
> +}
> +
> +static void iomap_iop_clear_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> +	unsigned first = (off >> inode->i_blkbits) + nr_blocks;
> +	unsigned last = ((off + len - 1) >> inode->i_blkbits) + nr_blocks;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iop->state_lock, flags);
> +	bitmap_clear(iop->state, first, last - first + 1);
> +	spin_unlock_irqrestore(&iop->state_lock, flags);
> +}
> +
> +static void iomap_clear_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	if (iop)
> +		iomap_iop_clear_range_dirty(folio, iop, off, len);
> +}
> +
>  static void iomap_finish_folio_read(struct folio *folio, size_t offset,
>  		size_t len, int error)
>  {
> @@ -665,6 +707,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>  	if (unlikely(copied < len && !folio_test_uptodate(folio)))
>  		return 0;
>  	iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
> +	iomap_set_range_dirty(folio, iop, offset_in_folio(folio, pos), len);
>  	filemap_dirty_folio(inode->i_mapping, folio);
>  	return copied;
>  }
> @@ -979,6 +1022,8 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
>  		block_commit_write(&folio->page, 0, length);
>  	} else {
>  		WARN_ON_ONCE(!folio_test_uptodate(folio));
> +		iomap_set_range_dirty(folio, to_iomap_page(folio),
> +				offset_in_folio(folio, iter->pos), length);
>  		folio_mark_dirty(folio);
>  	}
>  
> @@ -1354,7 +1399,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	 * invalid, grab a new one.
>  	 */
>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> -		if (iop && !test_bit(i, iop->state))
> +		if (iop && (!test_bit(i, iop->state) ||
> +			    !test_bit(i + nblocks, iop->state)))

Hmm.  So I /think/ these two test_bit()s mean that we skip any folio
sub-block if it's either not uptodate or not dirty?

I /think/ we only need to check the dirty status, right?  Like willy
said? :)

That said... somewhere we probably ought to check the consistency of the
two bits to ensure that they're not (dirty && !uptodate), given our
horrible history of getting things wrong with page and bufferhead state
bits.

Admittedly I'm not thrilled at the reintroduction of page and iop dirty
state that are updated in separate places, but OTOH the write
amplification here is demonstrably horrifying as you point out so it's
clearly necessary.

Maybe we need a debugging function that will check the page and iop
state, and call it every time we go in and out of critical iomap
functions (write, writeback, dropping pages, etc)
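
A sketch of what such a check could look like, assuming the invariant
suggested above (a block must never be dirty without also being
uptodate); the function name and WARN policy are hypothetical:

	static void iomap_iop_check_state(struct folio *folio,
			struct iomap_page *iop)
	{
		struct inode *inode = folio->mapping->host;
		unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
		unsigned int i;

		/* dirty (second bitmap half) implies uptodate (first half) */
		for (i = 0; i < nr_blocks; i++)
			WARN_ON_ONCE(test_bit(i + nr_blocks, iop->state) &&
				     !test_bit(i, iop->state));
	}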

--D

>  			continue;
>  
>  		error = wpc->ops->map_blocks(wpc, inode, pos);
> @@ -1397,6 +1443,9 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  		}
>  	}
>  
> +	iomap_clear_range_dirty(folio, iop,
> +				offset_in_folio(folio, folio_pos(folio)),
> +				end_pos - folio_pos(folio));
>  	folio_start_writeback(folio);
>  	folio_unlock(folio);
>  
> -- 
> 2.37.3
>
Matthew Wilcox (Oracle) Oct. 28, 2022, 6:15 p.m. UTC | #3
On Fri, Oct 28, 2022 at 10:01:19AM -0700, Darrick J. Wong wrote:
> On Fri, Oct 28, 2022 at 10:00:33AM +0530, Ritesh Harjani (IBM) wrote:
> > Performance testing of below fio workload reveals ~16x performance
> > improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
> > FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> > 
> > <test_randwrite.fio>
> > [global]
> > 	ioengine=psync
> > 	rw=randwrite
> > 	overwrite=1
> > 	pre_read=1
> > 	direct=0
> > 	bs=4k
> > 	size=1G
> > 	dir=./
> > 	numjobs=8
> > 	fdatasync=1
> > 	runtime=60
> > 	iodepth=64
> > 	group_reporting=1
>
> Admittedly I'm not thrilled at the reintroduction of page and iop dirty
> state that are updated in separate places, but OTOH the write
> amplification here is demonstrably horrifying as you point out so it's
> clearly necessary.

Well, *something* is necessary.  I worked on a different approach that
would have similar effects for this exact workload, which was to submit
the I/O for O_SYNC while we still know which part of the page we
dirtied.

Previous discussion:
https://lore.kernel.org/all/YQlgjh2R8OzJkFoB@casper.infradead.org/

Actual patches:
https://lore.kernel.org/all/20220503064008.3682332-1-willy@infradead.org/
Dave Chinner Oct. 28, 2022, 9:04 p.m. UTC | #4
On Fri, Oct 28, 2022 at 10:00:33AM +0530, Ritesh Harjani (IBM) wrote:
> On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
> filesystem blocksize, this patch should improve the performance by doing
> only the subpage dirty data write.
> 
> This should also reduce the write amplification since we can now track
> subpage dirty status within state bitmaps. Earlier we had to
> write the entire 64k page even if only a part of it (e.g. 4k) was
> updated.
> 
> Performance testing of below fio workload reveals ~16x performance
> improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
> FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> 
> <test_randwrite.fio>
> [global]
> 	ioengine=psync
> 	rw=randwrite
> 	overwrite=1
> 	pre_read=1
> 	direct=0
> 	bs=4k
> 	size=1G
> 	dir=./
> 	numjobs=8
> 	fdatasync=1
> 	runtime=60
> 	iodepth=64
> 	group_reporting=1
> 
> [fio-run]
> 
> Reported-by: Aravinda Herle <araherle@in.ibm.com>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>

To me, this is a fundamental architecture change in the way iomap
interfaces with the page cache and filesystems. Folio based dirty
tracking is top down, whilst filesystem block based dirty tracking
*needs* to be bottom up.

The bottom up approach is what bufferheads do, and it requires a
much bigger change than just adding dirty region tracking to the
iomap write and writeback paths.

That is, moving to tracking dirty regions on a filesystem block
boundary brings back all the coherency problems we had with
trying to keep bufferhead dirty state coherent with page dirty
state. This was one of the major simplifications that the iomap
infrastructure brought to the table - all the dirty tracking is done
by the page cache, and the filesystem has nothing to do with it at
all....

IF we are going to change this, then there needs to be clear rules
on how iomap dirty state is kept coherent with the folio dirty
state, and there need to be checks placed everywhere to ensure that
the rules are followed and enforced.

So what are the rules? If the folio is dirty, it must have at least one
dirty region? If the folio is clean, can it have dirty regions?

What happens to the dirty regions when truncate zeros part of a page
beyond EOF? If the iomap regions are clean, do they need to be
dirtied? If the regions are dirtied, do they need to be cleaned?
Does this hold for all trailing filesystem blocks in the (multipage)
folio, or just the one that spans the new EOF?

What happens with direct extent manipulation like fallocate()
operations? These invalidate the parts of the page cache over the
range we are punching, shifting, etc, without interacting directly
with iomap, so do we now have to ensure that the sub-folio dirty
regions are also invalidated correctly? i.e. do functions like
xfs_flush_unmap_range() need to become iomap infrastructure so that
they can update sub-folio dirty ranges correctly?

What about the
folio_mark_dirty()/filemap_dirty_folio()/->dirty_folio()
infrastructure? iomap currently treats this as top down, so it
doesn't actually call back into iomap to mark filesystem blocks
dirty. This would need to be rearchitected to match
block_dirty_folio() where the bufferheads on the page are marked
dirty before the folio is marked dirty by external operations....

The easy part of this problem is tracking dirty state on
filesystem block boundaries. The *hard part* is maintaining coherency
with the page cache, and none of that has been done yet. I'd prefer
that we deal with this problem once and for all at the page cache
level because multi-page folios mean even when the filesystem block
is the same as PAGE_SIZE, we have this sub-folio block granularity
tracking issue.

As it is, we already have the capability for the mapping tree to
have multiple indexes pointing to the same folio - perhaps it's time
to start thinking about using filesystem blocks as the mapping tree
index rather than PAGE_SIZE chunks, so that the page cache can then
track dirty state on filesystem block boundaries natively and
this whole problem goes away. We have to solve this sub-folio dirty
tracking problem for multi-page folios anyway, so it seems to me
that we should solve the sub-page block size dirty tracking problem
the same way....

Cheers,

Dave.
Ritesh Harjani (IBM) Oct. 29, 2022, 3:05 a.m. UTC | #5
On 22/10/28 01:42PM, Matthew Wilcox wrote:
> On Fri, Oct 28, 2022 at 10:00:33AM +0530, Ritesh Harjani (IBM) wrote:
> > @@ -1354,7 +1399,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >  	 * invalid, grab a new one.
> >  	 */
> >  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> > -		if (iop && !test_bit(i, iop->state))
> > +		if (iop && (!test_bit(i, iop->state) ||
> > +			    !test_bit(i + nblocks, iop->state)))
> >  			continue;
> >  
> >  		error = wpc->ops->map_blocks(wpc, inode, pos);
> 
> Why do we need to test both uptodate and dirty?  Surely we only need to
> test the dirty bit?  How can a !uptodate block ever be marked as dirty?

Yes, you are right. We don't need to test the uptodate bit.
In later revisions, I will correct that.

> 
> More generally, I think open-coding this is going to lead to confusion.
> We need wrappers like 'iop_block_dirty()' and 'iop_block_uptodate()'.

Sure. Makes sense. Thanks for the suggestion.


> (iop is still a bad name for this, but nobody's stepped up with a better
> one yet).

Looks fine to me :)

-ritesh
Ritesh Harjani (IBM) Oct. 29, 2022, 3:25 a.m. UTC | #6
On 22/10/28 10:01AM, Darrick J. Wong wrote:
> On Fri, Oct 28, 2022 at 10:00:33AM +0530, Ritesh Harjani (IBM) wrote:
> > On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
> > filesystem blocksize, this patch should improve the performance by doing
> > only the subpage dirty data write.
> > 
> > This should also reduce the write amplification since we can now track
> > subpage dirty status within state bitmaps. Earlier we had to
> > write the entire 64k page even if only a part of it (e.g. 4k) was
> > updated.
> > 
> > Performance testing of below fio workload reveals ~16x performance
> > improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
> > FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> > 
> > <test_randwrite.fio>
> > [global]
> > 	ioengine=psync
> > 	rw=randwrite
> > 	overwrite=1
> > 	pre_read=1
> > 	direct=0
> > 	bs=4k
> > 	size=1G
> > 	dir=./
> > 	numjobs=8
> > 	fdatasync=1
> > 	runtime=60
> > 	iodepth=64
> > 	group_reporting=1
> > 
> > [fio-run]
> > 
> > Reported-by: Aravinda Herle <araherle@in.ibm.com>
> > Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> > ---
> >  fs/iomap/buffered-io.c | 53 ++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 51 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > index 255f9f92668c..31ee80a996b2 100644
> > --- a/fs/iomap/buffered-io.c
> > +++ b/fs/iomap/buffered-io.c
> > @@ -58,7 +58,7 @@ iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
> >  	else
> >  		gfp = GFP_NOFS | __GFP_NOFAIL;
> >  
> > -	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
> > +	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(2 * nr_blocks)),
> >  		      gfp);
> >  	if (iop) {
> >  		spin_lock_init(&iop->state_lock);
> > @@ -168,6 +168,48 @@ static void iomap_set_range_uptodate(struct folio *folio,
> >  		folio_mark_uptodate(folio);
> >  }
> >  
> > +static void iomap_iop_set_range_dirty(struct folio *folio,
> > +		struct iomap_page *iop, size_t off, size_t len)
> > +{
> > +	struct inode *inode = folio->mapping->host;
> > +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> > +	unsigned first = (off >> inode->i_blkbits) + nr_blocks;
> > +	unsigned last = ((off + len - 1) >> inode->i_blkbits) + nr_blocks;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&iop->state_lock, flags);
> > +	bitmap_set(iop->state, first, last - first + 1);
> > +	spin_unlock_irqrestore(&iop->state_lock, flags);
> > +}
> > +
> > +static void iomap_set_range_dirty(struct folio *folio,
> > +		struct iomap_page *iop, size_t off, size_t len)
> > +{
> > +	if (iop)
> > +		iomap_iop_set_range_dirty(folio, iop, off, len);
> > +}
> > +
> > +static void iomap_iop_clear_range_dirty(struct folio *folio,
> > +		struct iomap_page *iop, size_t off, size_t len)
> > +{
> > +	struct inode *inode = folio->mapping->host;
> > +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> > +	unsigned first = (off >> inode->i_blkbits) + nr_blocks;
> > +	unsigned last = ((off + len - 1) >> inode->i_blkbits) + nr_blocks;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&iop->state_lock, flags);
> > +	bitmap_clear(iop->state, first, last - first + 1);
> > +	spin_unlock_irqrestore(&iop->state_lock, flags);
> > +}
> > +
> > +static void iomap_clear_range_dirty(struct folio *folio,
> > +		struct iomap_page *iop, size_t off, size_t len)
> > +{
> > +	if (iop)
> > +		iomap_iop_clear_range_dirty(folio, iop, off, len);
> > +}
> > +
> >  static void iomap_finish_folio_read(struct folio *folio, size_t offset,
> >  		size_t len, int error)
> >  {
> > @@ -665,6 +707,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
> >  	if (unlikely(copied < len && !folio_test_uptodate(folio)))
> >  		return 0;
> >  	iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
> > +	iomap_set_range_dirty(folio, iop, offset_in_folio(folio, pos), len);
> >  	filemap_dirty_folio(inode->i_mapping, folio);
> >  	return copied;
> >  }
> > @@ -979,6 +1022,8 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
> >  		block_commit_write(&folio->page, 0, length);
> >  	} else {
> >  		WARN_ON_ONCE(!folio_test_uptodate(folio));
> > +		iomap_set_range_dirty(folio, to_iomap_page(folio),
> > +				offset_in_folio(folio, iter->pos), length);
> >  		folio_mark_dirty(folio);
> >  	}
> >  
> > @@ -1354,7 +1399,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >  	 * invalid, grab a new one.
> >  	 */
> >  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> > -		if (iop && !test_bit(i, iop->state))
> > +		if (iop && (!test_bit(i, iop->state) ||
> > +			    !test_bit(i + nblocks, iop->state)))
> 
> Hmm.  So I /think/ these two test_bit()s mean that we skip any folio
> sub-block if it's either notuptodate or not dirty?
> 
> I /think/ we only need to check the dirty status, right?  Like willy
> said? :)

Yes. Agreed.

> 
> That said... somewhere we probably ought to check the consistency of the
> two bits to ensure that they're not (dirty && !uptodate), given our
> horrible history of getting things wrong with page and bufferhead state
> bits.
> 
> Admittedly I'm not thrilled at the reintroduction of page and iop dirty
> state that are updated in separate places, but OTOH the write
> amplification here is demonstrably horrifying as you point out so it's
> clearly necessary.

On a 64K pagesize platform, the performance of workloads like the one
I mentioned is also quite bad.


> 
> Maybe we need a debugging function that will check the page and iop
> state, and call it every time we go in and out of critical iomap
> functions (write, writeback, dropping pages, etc)

I will try and review each of the paths once again to ensure consistency.
What I see is that we only mark the iop->state dirty bits before dirtying
the page in the iomap buffered-io paths. This happens in two places:
1. __iomap_write_end(), where we call filemap_dirty_folio(). We mark the
   iop state dirty bits before calling filemap_dirty_folio().
2. iomap_folio_mkwrite_iter(). Here again, before calling folio_mark_dirty(),
   we set the dirty state bits. This is the iomap_page_mkwrite path.

But, I would still like to review each of these and other paths as well.

-ritesh


> 
> --D
> 
> >  			continue;
> >  
> >  		error = wpc->ops->map_blocks(wpc, inode, pos);
> > @@ -1397,6 +1443,9 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >  		}
> >  	}
> >  
> > +	iomap_clear_range_dirty(folio, iop,
> > +				offset_in_folio(folio, folio_pos(folio)),
> > +				end_pos - folio_pos(folio));
> >  	folio_start_writeback(folio);
> >  	folio_unlock(folio);
> >  
> > -- 
> > 2.37.3
> >
Ritesh Harjani (IBM) Oct. 30, 2022, 3:27 a.m. UTC | #7
On 22/10/29 08:04AM, Dave Chinner wrote:
> On Fri, Oct 28, 2022 at 10:00:33AM +0530, Ritesh Harjani (IBM) wrote:
> > On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
> > filesystem blocksize, this patch should improve the performance by doing
> > only the subpage dirty data write.
> > 
> > This should also reduce the write amplification since we can now track
> > subpage dirty status within state bitmaps. Earlier we had to
> > write the entire 64k page even if only a part of it (e.g. 4k) was
> > updated.
> > 
> > Performance testing of below fio workload reveals ~16x performance
> > improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
> > FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> > 
> > <test_randwrite.fio>
> > [global]
> > 	ioengine=psync
> > 	rw=randwrite
> > 	overwrite=1
> > 	pre_read=1
> > 	direct=0
> > 	bs=4k
> > 	size=1G
> > 	dir=./
> > 	numjobs=8
> > 	fdatasync=1
> > 	runtime=60
> > 	iodepth=64
> > 	group_reporting=1
> > 
> > [fio-run]
> > 
> > Reported-by: Aravinda Herle <araherle@in.ibm.com>
> > Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> 
> To me, this is a fundamental architecture change in the way iomap
> interfaces with the page cache and filesystems. Folio based dirty
> tracking is top down, whilst filesystem block based dirty tracking
> *needs* to be bottom up.
> 
> The bottom up approach is what bufferheads do, and it requires a
> much bigger change that just adding dirty region tracking to the
> iomap write and writeback paths.
> 
> That is, moving to tracking dirty regions on a filesystem block
> boundary brings back all the coherency problems we had with
> trying to keep bufferhead dirty state coherent with page dirty
> state. This was one of the major simplifications that the iomap
> infrastructure brought to the table - all the dirty tracking is done

Sure, but then with the simplified iomap design these optimizations for
the workload that I mentioned are also lost :(

> by the page cache, and the filesystem has nothing to do with it at
> all....
> 
> IF we are going to change this, then there needs to be clear rules
> on how iomap dirty state is kept coherent with the folio dirty
> state, and there need to be checks placed everywhere to ensure that
> the rules are followed and enforced.

Sure.

> 
> So what are the rules? If the folio is dirty, it must have at least one
> dirty region? If the folio is clean, can it have dirty regions?
> 
> What happens to the dirty regions when truncate zeros part of a page
> beyond EOF? If the iomap regions are clean, do they need to be
> dirtied? If the regions are dirtied, do they need to be cleaned?
> Does this hold for all trailing filesystem blocks in the (multipage)
> folio, of just the one that spans the new EOF?
> 
> What happens with direct extent manipulation like fallocate()
> operations? These invalidate the parts of the page cache over the
> range we are punching, shifting, etc, without interacting directly
> with iomap, so do we now have to ensure that the sub-folio dirty
> regions are also invalidated correctly? i.e. do functions like
> xfs_flush_unmap_range() need to become iomap infrastructure so that
> they can update sub-folio dirty ranges correctly?
> 
> What about the
> folio_mark_dirty()/filemap_dirty_folio()/.folio_dirty()
> infrastructure? iomap currently treats this as top down, so it
> doesn't actually call back into iomap to mark filesystem blocks
> dirty. This would need to be rearchitected to match
> block_dirty_folio() where the bufferheads on the page are marked
> dirty before the folio is marked dirty by external operations....

Sure, thanks for clearly listing out all of the paths.
Let me carefully review these paths to check how adding a state
bitmap to iomap_page for dirty tracking is handled in every case you
mentioned above. I would like to ensure that we have reviewed all the
paths and that, functionally and theoretically, this approach at least works fine.
(Mainly I want to go over the truncate & fallocate paths that you listed above.)


> 
> The easy part of this problem is tracking dirty state on a
> filesystem block boundaries. The *hard part* maintaining coherency
> with the page cache, and none of that has been done yet. I'd prefer
> that we deal with this problem once and for all at the page cache
> level because multi-page folios mean even when the filesystem block
> is the same as PAGE_SIZE, we have this sub-folio block granularity
> tracking issue.
> 
> As it is, we already have the capability for the mapping tree to
> have multiple indexes pointing to the same folio - perhaps it's time
> to start thinking about using filesystem blocks as the mapping tree
> index rather than PAGE_SIZE chunks, so that the page cache can then
> track dirty state on filesystem block boundaries natively and
> this whole problem goes away. We have to solve this sub-folio dirty
> tracking problem for multi-page folios anyway, so it seems to me
> that we should solve the sub-page block size dirty tracking problem
> the same way....
> 
> Cheers,
> 
> Dave.

Thanks a lot, Dave, for your review comments. You have listed a few other
points which I am not commenting on yet, since I would like to review those
carefully. I am currently traveling and will be back in a few days.
Once I am back, I will study this area more based on your comments and
get back to you on those points as well.

-ritesh

> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Oct. 30, 2022, 10:31 p.m. UTC | #8
On Sun, Oct 30, 2022 at 08:57:58AM +0530, Ritesh Harjani (IBM) wrote:
> On 22/10/29 08:04AM, Dave Chinner wrote:
> > On Fri, Oct 28, 2022 at 10:00:33AM +0530, Ritesh Harjani (IBM) wrote:
> > > On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
> > > filesystem blocksize, this patch should improve the performance by doing
> > > only the subpage dirty data write.
> > > 
> > > This should also reduce the write amplification since we can now track
> > > subpage dirty status within state bitmaps. Earlier we had to
> > > write the entire 64k page even if only a part of it (e.g. 4k) was
> > > updated.
> > > 
> > > Performance testing of below fio workload reveals ~16x performance
> > > improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
> > > FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> > > 
> > > <test_randwrite.fio>
> > > [global]
> > > 	ioengine=psync
> > > 	rw=randwrite
> > > 	overwrite=1
> > > 	pre_read=1
> > > 	direct=0
> > > 	bs=4k
> > > 	size=1G
> > > 	dir=./
> > > 	numjobs=8
> > > 	fdatasync=1
> > > 	runtime=60
> > > 	iodepth=64
> > > 	group_reporting=1
> > > 
> > > [fio-run]
> > > 
> > > Reported-by: Aravinda Herle <araherle@in.ibm.com>
> > > Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> > 
> > To me, this is a fundamental architecture change in the way iomap
> > interfaces with the page cache and filesystems. Folio based dirty
> > tracking is top down, whilst filesystem block based dirty tracking
> > *needs* to be bottom up.
> > 
> > The bottom up approach is what bufferheads do, and it requires a
> > much bigger change that just adding dirty region tracking to the
> > iomap write and writeback paths.
> > 
> > That is, moving to tracking dirty regions on a filesystem block
> > boundary brings back all the coherency problems we had with
> > trying to keep bufferhead dirty state coherent with page dirty
> > state. This was one of the major simplifications that the iomap
> > infrastructure brought to the table - all the dirty tracking is done
> 
> Sure, but then with simplified iomap design these optimization in the 
> workload that I mentioned are also lost :(

Yes, we knew that when we first started planning to get rid of
bufferheads from XFS. That was, what, 7 years ago we started down
that path, and it's been that way in production systems since at
least 4.19.

> > by the page cache, and the filesystem has nothing to do with it at
> > all....
> > 
> > IF we are going to change this, then there needs to be clear rules
> > on how iomap dirty state is kept coherent with the folio dirty
> > state, and there need to be checks placed everywhere to ensure that
> > the rules are followed and enforced.
> 
> Sure.
> 
> > 
> > So what are the rules? If the folio is dirty, it must have at least one
> > dirty region? If the folio is clean, can it have dirty regions?
> > 
> > What happens to the dirty regions when truncate zeros part of a page
> > beyond EOF? If the iomap regions are clean, do they need to be
> > dirtied? If the regions are dirtied, do they need to be cleaned?
> > Does this hold for all trailing filesystem blocks in the (multipage)
> > folio, of just the one that spans the new EOF?
> > 
> > What happens with direct extent manipulation like fallocate()
> > operations? These invalidate the parts of the page cache over the
> > range we are punching, shifting, etc, without interacting directly
> > with iomap, so do we now have to ensure that the sub-folio dirty
> > regions are also invalidated correctly? i.e. do functions like
> > xfs_flush_unmap_range() need to become iomap infrastructure so that
> > they can update sub-folio dirty ranges correctly?
> > 
> > What about the
> > folio_mark_dirty()/filemap_dirty_folio()/.folio_dirty()
> > infrastructure? iomap currently treats this as top down, so it
> > doesn't actually call back into iomap to mark filesystem blocks
> > dirty. This would need to be rearchitected to match
> > block_dirty_folio() where the bufferheads on the page are marked
> > dirty before the folio is marked dirty by external operations....
> 
> Sure thanks for clearly listing out all of the paths. 
> Let me carefully review these paths to check on, how does adding a state 
> bitmap to iomap_page for dirty tracking is handled in every case which you 
> mentioned above. I would like to ensure, that we have reviewed all the 
> paths and functionally + theoritically this approach at least works fine.
> (Mainly I wanted to go over the truncate & fallocate paths that you listed above).

I'm kinda pointing all this out as the reason why we don't want
to go down this path again - per-filesystem block dirty tracking was
the single largest source of data corruption bugs in XFS back in the
days of bufferheads...

I really, really don't want the iomap infrastructure to head back in
that direction - we know it leads to on-going data corruption pain
because that's where we came from to get here.

I would much prefer we move to a different model for semi-random
sub-page writes. That was also Willy's suggestion - async
write-through caching (so the page never goes through a dirty state
for sub-page writes) is a much better model for modern high
performance storage systems than the existing buffered writeback
model....

Cheers,

Dave.
Matthew Wilcox (Oracle) Oct. 31, 2022, 3:43 a.m. UTC | #9
On Sat, Oct 29, 2022 at 08:04:22AM +1100, Dave Chinner wrote:
> To me, this is a fundamental architecture change in the way iomap
> interfaces with the page cache and filesystems. Folio based dirty
> tracking is top down, whilst filesystem block based dirty tracking
> *needs* to be bottom up.
> 
> The bottom up approach is what bufferheads do, and it requires a
> much bigger change that just adding dirty region tracking to the
> iomap write and writeback paths.

I agree that bufferheads do bottom-up dirty tracking, but I don't think
that what Ritesh is doing here is bottom-up dirty tracking.  Buffer
heads expose an API to dirty a block, which necessarily goes bottom-up.
There's no API here to dirty a block.  Instead there's an API to dirty
a range of a folio, so we're still top-down; we're just keeping track
of it in a more precise way.

It's a legitimate complaint that there's now state that needs to be
kept in sync with the page cache.  More below ...

> That is, moving to tracking dirty regions on a filesystem block
> boundary brings back all the coherency problems we had with
> trying to keep bufferhead dirty state coherent with page dirty
> state. This was one of the major simplifications that the iomap
> infrastructure brought to the table - all the dirty tracking is done
> by the page cache, and the filesystem has nothing to do with it at
> all....
> 
> IF we are going to change this, then there needs to be clear rules
> on how iomap dirty state is kept coherent with the folio dirty
> state, and there need to be checks placed everywhere to ensure that
> the rules are followed and enforced.
> 
> So what are the rules? If the folio is dirty, it must have at least one
> dirty region? If the folio is clean, can it have dirty regions?

If there is any dirty region, the folio must be marked dirty (otherwise
we'll never know that it needs to be written back).  The interesting
question (as your paragraph below hints) is whether removing the dirty
part of a folio from a file marks the folio clean.  I believe that's
optional, but it's probably worth doing.

> What happens to the dirty regions when truncate zeros part of a page
> beyond EOF? If the iomap regions are clean, do they need to be
> dirtied? If the regions are dirtied, do they need to be cleaned?
> Does this hold for all trailing filesystem blocks in the (multipage)
> folio, of just the one that spans the new EOF?
> 
> What happens with direct extent manipulation like fallocate()
> operations? These invalidate the parts of the page cache over the
> range we are punching, shifting, etc, without interacting directly
> with iomap, so do we now have to ensure that the sub-folio dirty
> regions are also invalidated correctly? i.e. do functions like
> xfs_flush_unmap_range() need to become iomap infrastructure so that
> they can update sub-folio dirty ranges correctly?

I'm slightly confused by this question.  As I understand the various
fallocate operations, they start by kicking out all the folios affected
by the operation (generally from the start of the operation to EOF),
so we'd writeback the (dirty part of) folios which are dirty, then
invalidate the folios in cache.  I'm not sure there's going to be
much difference.

> What about the
> folio_mark_dirty()/filemap_dirty_folio()/.folio_dirty()
> infrastructure? iomap currently treats this as top down, so it
> doesn't actually call back into iomap to mark filesystem blocks
> dirty. This would need to be rearchitected to match
> block_dirty_folio() where the bufferheads on the page are marked
> dirty before the folio is marked dirty by external operations....

Yes.  This is also going to be a performance problem.  Marking a folio as
dirty is no longer just setting the bit in struct folio and the xarray
but also setting all the bits in iop->state.  Depending on the size
of the folio, and the fs blocksize, this could be quite a lot of bits.
eg a 2MB folio with a 1k block size is 2048 bits (256 bytes, 6 cachelines
(it dirties the spinlock in cacheline 0, then the bitmap occupies 3 full
cachelines and 2 partial ones)).

I don't see the necessary churn from filemap_dirty_folio() to
iomap_dirty_folio() as being a huge deal, but it's definitely a missing
piece from this RFC.
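
One possible shape of that piece, sketched on the assumption that it
mirrors block_dirty_folio() by dirtying every block before the folio
itself (the wiring into the aops is omitted and hypothetical for this
RFC):

	static bool iomap_dirty_folio(struct address_space *mapping,
			struct folio *folio)
	{
		struct iomap_page *iop = to_iomap_page(folio);

		/* mark all blocks dirty before the folio itself */
		iomap_set_range_dirty(folio, iop, 0, folio_size(folio));
		return filemap_dirty_folio(mapping, folio);
	}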

> The easy part of this problem is tracking dirty state on a
> filesystem block boundaries. The *hard part* maintaining coherency
> with the page cache, and none of that has been done yet. I'd prefer
> that we deal with this problem once and for all at the page cache
> level because multi-page folios mean even when the filesystem block
> is the same as PAGE_SIZE, we have this sub-folio block granularity
> tracking issue.
> 
> As it is, we already have the capability for the mapping tree to
> have multiple indexes pointing to the same folio - perhaps it's time
> to start thinking about using filesystem blocks as the mapping tree
> index rather than PAGE_SIZE chunks, so that the page cache can then
> track dirty state on filesystem block boundaries natively and
> this whole problem goes away. We have to solve this sub-folio dirty
> tracking problem for multi-page folios anyway, so it seems to me
> that we should solve the sub-page block size dirty tracking problem
> the same way....

That's an interesting proposal.  From the page cache's point of
view right now, there is only one dirty bit per folio, not per page.
Anything you see contrary to that is old code that needs to be converted.
So even indexing the page cache by block offset rather than page offset
wouldn't help.

We have a number of people looking at the analogous problem for network
filesystems right now.  Dave Howells' netfs infrastructure is trying
to solve the problem for everyone (and he's been looking at iomap as
inspiration for what he's doing).  I'm kind of hoping we end up with one
unified solution that can be used for all filesystems that want sub-folio
dirty tracking.  His solution is a bit more complex than I really want
to see, at least partially because he's trying to track dirtiness at
byte granularity, no matter how much pain that causes to the server.
Dave Chinner Oct. 31, 2022, 7:08 a.m. UTC | #10
On Mon, Oct 31, 2022 at 03:43:24AM +0000, Matthew Wilcox wrote:
> On Sat, Oct 29, 2022 at 08:04:22AM +1100, Dave Chinner wrote:
> > As it is, we already have the capability for the mapping tree to
> > have multiple indexes pointing to the same folio - perhaps it's time
> > to start thinking about using filesystem blocks as the mapping tree
> > index rather than PAGE_SIZE chunks, so that the page cache can then
> > track dirty state on filesystem block boundaries natively and
> > this whole problem goes away. We have to solve this sub-folio dirty
> > tracking problem for multi-page folios anyway, so it seems to me
> > that we should solve the sub-page block size dirty tracking problem
> > the same way....
> 
> That's an interesting proposal.  From the page cache's point of
> view right now, there is only one dirty bit per folio, not per page.

Per folio, yes, but I thought we also had a dirty bit per index
entry in the mapping tree. Writeback code uses the
PAGECACHE_TAG_DIRTY mark to find the dirty folios efficiently (i.e.
the write_cache_pages() iterator), so it's not like this is
something new. i.e. we already have coherent, external dirty bit
tracking mechanisms outside the folio itself that filesystems
use.
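
As a rough illustration of that existing mechanism (heavily simplified;
real writeback goes through write_cache_pages() with proper xas retry
handling and batching), given a struct address_space *mapping:

	XA_STATE(xas, &mapping->i_pages, 0);
	struct folio *folio;

	rcu_read_lock();
	/* walk only the mapping-tree entries tagged dirty */
	xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
		/* each folio found here is dirty per the mapping tree */
	}
	rcu_read_unlock();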

That's kinda what I'm getting at here - we already have coherent
dirty state tracking outside of the individual folios themselves.
Hence if we have to track sub-folio up-to-date state, sub-folio
dirty state and, potentially, sub-folio writeback state outside the
folio itself, why not do it by extending the existing coherent dirty
state tracking that is built into the mapping tree itself?

Folios + Xarray have given us the ability to disconnect the size of
the cached item at any given index from the index granularity - why
not extend that down to sub-page folio granularity in addition to
the scaling up we've been doing for large (multipage) folio
mappings?

Then we don't need any sort of filesystem specific "add-on" that sits
alongside the mapping tree that tries to keep track of dirty state
in addition to the folio and the mapping tree tracking that already
exists...

> We have a number of people looking at the analogous problem for network
> filesystems right now.  Dave Howells' netfs infrastructure is trying
> to solve the problem for everyone (and he's been looking at iomap as
> inspiration for what he's doing).  I'm kind of hoping we end up with one
> unified solution that can be used for all filesystems that want sub-folio
> dirty tracking.  His solution is a bit more complex than I really want
> to see, at least partially because he's trying to track dirtiness at
> byte granularity, no matter how much pain that causes to the server.

Byte range granularity is probably overkill for block based
filesystems - all we need is a couple of extra bits per block to be
stored in the mapping tree alongside the folio....

Cheers,

Dave.
Matthew Wilcox (Oracle) Oct. 31, 2022, 10:27 a.m. UTC | #11
On Mon, Oct 31, 2022 at 06:08:53PM +1100, Dave Chinner wrote:
> On Mon, Oct 31, 2022 at 03:43:24AM +0000, Matthew Wilcox wrote:
> > On Sat, Oct 29, 2022 at 08:04:22AM +1100, Dave Chinner wrote:
> > > As it is, we already have the capability for the mapping tree to
> > > have multiple indexes pointing to the same folio - perhaps it's time
> > > to start thinking about using filesystem blocks as the mapping tree
> > > index rather than PAGE_SIZE chunks, so that the page cache can then
> > > track dirty state on filesystem block boundaries natively and
> > > this whole problem goes away. We have to solve this sub-folio dirty
> > > tracking problem for multi-page folios anyway, so it seems to me
> > > that we should solve the sub-page block size dirty tracking problem
> > > the same way....
> > 
> > That's an interesting proposal.  From the page cache's point of
> > view right now, there is only one dirty bit per folio, not per page.
> 
> Per folio, yes, but I thought we also had a dirty bit per index
> entry in the mapping tree. Writeback code uses the
> PAGECACHE_TAG_DIRTY mark to find the dirty folios efficiently (i.e.
> the write_cache_pages() iterator), so it's not like this is
> something new. i.e. we already have coherent, external dirty bit
> tracking mechanisms outside the folio itself that filesystems
> use.

That bit only exists (logically) for the canonical entry.  Physically
it exists for sibling entries, but it's not used; attempting to set
it on sibling entries will redirect to set it on the canonical entry.
That could be changed, but we elide entire layers of the tree once the
entry has a sufficiently high order.  So an order-6 folio occupies
a single slot one layer up; an order-7 folio occupies two slots, an
order-8 folio occupies four slots and so on.

My eventual goal is to ditch the radix tree and use the Maple Tree
(ie a B-tree), and that will always only have one slot per folio, no
matter what order it has.  Then there really only will be one bit per
folio.

> > We have a number of people looking at the analogous problem for network
> > filesystems right now.  Dave Howells' netfs infrastructure is trying
> > to solve the problem for everyone (and he's been looking at iomap as
> > inspiration for what he's doing).  I'm kind of hoping we end up with one
> > unified solution that can be used for all filesystems that want sub-folio
> > dirty tracking.  His solution is a bit more complex than I really want
> > to see, at least partially because he's trying to track dirtiness at
> > byte granularity, no matter how much pain that causes to the server.
> 
> Byte range granularity is probably overkill for block based
> filesystems - all we need is a couple of extra bits per block to be
> stored in the mapping tree alongside the folio....

I think it's overkill for network filesystems too.  By sending a
sector-misaligned write to the server, you force the server to do a R-M-W
before it commits the write to storage.  Assuming that the file has fallen
out of the server's cache, and a sufficiently busy server probably doesn't
have the memory capacity for the working set of all of its clients.

Anyway, Dave's plan for dirty tracking (as I understand the current
iteration) is to not store it linked from folio->private at all, but to
store it in a per-file tree of writes.  Then we wouldn't walk the page
cache looking for dirty folios, but walk the tree of writes choosing
which ones to write back and delete from the tree.  I don't know how
this will perform in practice, but it'll be generic enough to work for
any filesystem.
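
Purely to illustrate the shape of that idea (everything below is
hypothetical; Dave Howells' actual design may differ), each file would
carry an ordered tree of dirty byte ranges that writeback walks and
prunes instead of scanning the page cache:

	/* hypothetical per-file record of one dirty byte range */
	struct dirty_region {
		struct rb_node	node;	/* in a per-inode rbtree, ordered by start */
		loff_t		start;
		loff_t		end;
	};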
Christoph Hellwig Nov. 2, 2022, 8:57 a.m. UTC | #12
On Mon, Oct 31, 2022 at 10:27:16AM +0000, Matthew Wilcox wrote:
> > Byte range granularity is probably overkill for block based
> > filesystems - all we need is a couple of extra bits per block to be
> > stored in the mapping tree alongside the folio....
> 
> I think it's overkill for network filesystems too.  By sending a
> sector-misaligned write to the server, you force the server to do a R-M-W
> before it commits the write to storage.  Assuming that the file has fallen
> out of the server's cache, and a sufficiently busy server probably doesn't
> have the memory capacity for the working set of all of its clients.

That really depends on your server.  For NFS there are definitely
servers that can deal with unaligned writes fairly well because they
just log the data in non-volatile memory.  That being said, I'm not sure
it really is worth optimizing the Linux pagecache for that particular
use case.

> Anyway, Dave's plan for dirty tracking (as I understand the current
> iteration) is to not store it linked from folio->private at all, but to
> store it in a per-file tree of writes.  Then we wouldn't walk the page
> cache looking for dirty folios, but walk the tree of writes choosing
> which ones to write back and delete from the tree.  I don't know how
> this will perform in practice, but it'll be generic enough to work for
> any filesystem.

Yes, this would be generic.  But having multiple tracking trees might
not be super optimal - it always reminds me of the btrfs I/O code that
is lost in a maze of trees and performs rather suboptimally.
Christoph Hellwig Nov. 2, 2022, 9:03 a.m. UTC | #13
On Mon, Oct 31, 2022 at 03:43:24AM +0000, Matthew Wilcox wrote:
> I agree that bufferheads do bottom-up dirty tracking, but I don't think
> that what Ritesh is doing here is bottom-up dirty tracking.  Buffer
> heads expose an API to dirty a block, which necessarily goes bottom-up.
> There's no API here to dirty a block.  Instead there's an API to dirty
> a range of a folio, so we're still top-down; we're just keeping track
> of it in a more precise way.

Agreed.

> If there is any dirty region, the folio must be marked dirty (otherwise
> we'll never know that it needs to be written back).  The interesting
> question (as your paragraph below hints) is whether removing the dirty
> part of a folio from a file marks the folio clean.  I believe that's
> optional, but it's probably worth doing.

Also agreed.

> > What happens with direct extent manipulation like fallocate()
> > operations? These invalidate the parts of the page cache over the
> > range we are punching, shifting, etc, without interacting directly
> > with iomap, so do we now have to ensure that the sub-folio dirty
> > regions are also invalidated correctly? i.e. do functions like
> > xfs_flush_unmap_range() need to become iomap infrastructure so that
> > they can update sub-folio dirty ranges correctly?
> 
> I'm slightly confused by this question.  As I understand the various
> fallocate operations, they start by kicking out all the folios affected
> by the operation (generally from the start of the operation to EOF),
> so we'd writeback the (dirty part of) folios which are dirty, then
> invalidate the folios in cache.  I'm not sure there's going to be
> much difference.

Yes.  As far as I can tell, all pagecache manipulation for the
fallocate operations is driven by the file system, and it is
only done for the punch/zero/move ranges.  The file system
then goes through the normal pagecache truncate helpers, rounded to
the block size, which through the ops should do the right thing.


> Yes.  This is also going to be a performance problem.  Marking a folio as
> dirty is no longer just setting the bit in struct folio and the xarray
> but also setting all the bits in iop->state.  Depending on the size
> of the folio, and the fs blocksize, this could be quite a lot of bits.
> eg a 2MB folio with a 1k block size is 2048 bits (256 bytes, 6 cachelines
> (it dirties the spinlock in cacheline 0, then the bitmap occupies 3 full
> cachelines and 2 partial ones)).

We can always optimize by having a bit for the fairly common all-dirty
case and only track and look at the array if that is not the case.
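
A sketch of that fast path, assuming a hypothetical IOP_ALL_DIRTY flag
in a new iop->flags word (neither exists in this RFC); a dirty-test
wrapper could then short-circuit the bitmap walk:

	static bool iop_block_dirty(struct iomap_page *iop,
			unsigned int block, unsigned int nr_blocks)
	{
		/* common case: the whole folio was dirtied */
		if (test_bit(IOP_ALL_DIRTY, &iop->flags))
			return true;
		/* otherwise consult the per-block dirty bits */
		return test_bit(block + nr_blocks, iop->state);
	}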

> filesystems right now.  Dave Howells' netfs infrastructure is trying
> to solve the problem for everyone (and he's been looking at iomap as
> inspiration for what he's doing).

Btw, I never understood why the network file systems don't just use
iomap.  There is nothing block-specific in the core iomap code.
Darrick J. Wong Nov. 2, 2022, 5:35 p.m. UTC | #14
On Wed, Nov 02, 2022 at 02:03:11AM -0700, Christoph Hellwig wrote:
> On Mon, Oct 31, 2022 at 03:43:24AM +0000, Matthew Wilcox wrote:
> > I agree that bufferheads do bottom-up dirty tracking, but I don't think
> > that what Ritesh is doing here is bottom-up dirty tracking.  Buffer
> > heads expose an API to dirty a block, which necessarily goes bottom-up.
> > There's no API here to dirty a block.  Instead there's an API to dirty
> > a range of a folio, so we're still top-down; we're just keeping track
> > of it in a more precise way.
> 
> Agreed.

Me three.  Er, too.

Unlike bufferheads which are scattered all over the kernel, the details
of dirty state tracking are now confined to buffered-io.c, and the
external functions remain the same.  From my POV that makes the dirty
state an implementation detail of iomap that callers don't have to care
about.

Just so long as nobody imports that nonfeature of the bufferhead code
where dirtying an already dirty bufferhead skips marking the folio dirty
and writepage failures also fail to redirty the page.  That bit us hard
recently and I strongly prefer not to repeat that.

> > If there is any dirty region, the folio must be marked dirty (otherwise
> > we'll never know that it needs to be written back).  The interesting
> > question (as your paragraph below hints) is whether removing the dirty
> > part of a folio from a file marks the folio clean.  I believe that's
> > optional, but it's probably worth doing.
> 
> Also agreed.
> 
> > > What happens with direct extent manipulation like fallocate()
> > > operations? These invalidate the parts of the page cache over the
> > > range we are punching, shifting, etc, without interacting directly
> > > with iomap, so do we now have to ensure that the sub-folio dirty
> > > regions are also invalidated correctly? i.e. do functions like
> > > xfs_flush_unmap_range() need to become iomap infrastructure so that
> > > they can update sub-folio dirty ranges correctly?
> > 
> > I'm slightly confused by this question.  As I understand the various
> > fallocate operations, they start by kicking out all the folios affected
> > by the operation (generally from the start of the operation to EOF),
> > so we'd writeback the (dirty part of) folios which are dirty, then
> > invalidate the folios in cache.  I'm not sure there's going to be
> > much difference.
> 
> Yes.  As far as I can tell all pagecache manipulation for the
> fallocate operations is driven by the file system and it is
> only done by those the punch/zero/move ranges.  The file system
> then goes though the normal pagecache truncate helpers rounded to
> the block size, which through the ops should do the right thing.

Yes, AFAICT.

> > Yes.  This is also going to be a performance problem.  Marking a folio as
> > dirty is no longer just setting the bit in struct folio and the xarray
> > but also setting all the bits in iop->state.  Depending on the size
> > of the folio, and the fs blocksize, this could be quite a lot of bits.
> > eg a 2MB folio with a 1k block size is 2048 bits (256 bytes, 6 cachelines
> > (it dirties the spinlock in cacheline 0, then the bitmap occupies 3 full
> > cachelines and 2 partial ones)).
> 
> We can always optimize by having a bit for the fairly common all dirty
> case and only track and look at the array if that is no the case.

Yes, it would help to make the ranges in the bit array better defined
than the semi-opencoded logic there is now.  (I'm whining specifically
about the test_bit calls sprinkled around).  Once that's done it
shouldn't be hard to add one more bit for the all-dirty state.  Though
I'd want to see the numbers to prove that it saves us time anywhere.

--D

> > filesystems right now.  Dave Howells' netfs infrastructure is trying
> > to solve the problem for everyone (and he's been looking at iomap as
> > inspiration for what he's doing).
> 
> Btw, I never understod why the network file systems don't just use
> iomap.  There is nothing block specific in the core iomap code.
Dave Chinner Nov. 3, 2022, 12:38 a.m. UTC | #15
On Wed, Nov 02, 2022 at 01:57:58AM -0700, Christoph Hellwig wrote:
> On Mon, Oct 31, 2022 at 10:27:16AM +0000, Matthew Wilcox wrote:
> > > Byte range granularity is probably overkill for block based
> > > filesystems - all we need is a couple of extra bits per block to be
> > > stored in the mapping tree alongside the folio....
> > 
> > I think it's overkill for network filesystems too.  By sending a
> > sector-misaligned write to the server, you force the server to do a R-M-W
> > before it commits the write to storage.  Assuming that the file has fallen
> > out of the server's cache, and a sufficiently busy server probably doesn't
> > have the memory capacity for the working set of all of its clients.
> 
> That really depends on your server.  For NFS there's definitively
> servers that can deal with unaligned writes fairly well because they
> just log the data in non volatile memory.  That being said I'm not sure
> it really is worth to optimize the Linux pagecache for that particular
> use case.
> 
> > Anyway, Dave's plan for dirty tracking (as I understand the current
> > iteration) is to not store it linked from folio->private at all, but to
> > store it in a per-file tree of writes.  Then we wouldn't walk the page
> > cache looking for dirty folios, but walk the tree of writes choosing
> > which ones to write back and delete from the tree.  I don't know how
> > this will perform in practice, but it'll be generic enough to work for
> > any filesystem.
> 
> Yes, this would be generic.  But having multiple tracking trees might
> not be super optimal - it always reminds me of the btrfs I/O code that
> is lost in a maze of trees and performs rather suboptimal.

Yep, that's kinda what I'm trying to see if we can avoid....

-Dave.
David Howells Nov. 3, 2022, 2:12 p.m. UTC | #16
Matthew Wilcox <willy@infradead.org> wrote:

> ... His solution is a bit more complex than I really want to see, at least
> partially because he's trying to track dirtiness at byte granularity, no
> matter how much pain that causes to the server.

Actually, I'm looking at page-level granularity at the moment.

David
David Howells Nov. 3, 2022, 2:51 p.m. UTC | #17
Christoph Hellwig <hch@infradead.org> wrote:

> > filesystems right now.  Dave Howells' netfs infrastructure is trying
> > to solve the problem for everyone (and he's been looking at iomap as
> > inspiration for what he's doing).
> 
> Btw, I never understod why the network file systems don't just use
> iomap.  There is nothing block specific in the core iomap code.

It creates and submits bio structs all over the place.  This seems to
require a blockdev.

Anyway, netfs lib supports, or hopefully will support in the future, the
following:

 (1) Fscache.  netfslib will construct a read you're asking for from cached
     data and data from the server and stitch them together (where a folio may
     comprise pieces from more than once source), and then write the bits it
     read from the server out to the cache...  And handle content encryption
     for you such that the data stored in the cache is content-encrypted.

     On writeback, the dirty data must be written to both the cache (if you
     have one) and the server (if you're not in disconnected operation).

 (2) Disconnected operation.  netfslib will, in the future, handle storing
     data and changes in the cache and then sync'ing on reconnection of an
     object.

 (3) I want to hand persistent (for the life of an op) iov_iters to the
     filesystem so that the filesystem can, if it wants to, pass these to the
     kernel_sendmsg() and kernel_recvmsg() at the bottom.

     The aim is to get knowledge of pages out of the network filesystem
     entirely.  A network filesystem would then provide two basic hooks to the
     server: async direct read and async direct write.  netfslib will use
     these to access the pagecache on behalf of the filesystem.

 (4) Reads and writes might want to/need to be non-block-size aligned.  If we
     have a byte-range file lock, for example, or if we have a max block size
     (eg. rsize/wsize) set that's not a multiple of 512, say.

 (5) Compressed I/O.  You get back more data than you asked for and you want
     to paste the rest into the pagecache (if buffered) or discard it (if
     DIO).  Further, to make this work on write, we may need to hold on to
     pages on the sides of the one we modified to make sure we keep the right
     size blob of data to recompress and send back.

 (6) Larger cache block granularity.  One thing I want to explore is the
     ability to have blocks in the cache that are larger than PAGE_SIZE.  If I
     can't use the backing filesystem's knowledge of holes in a file, then I
     have to store my own metadata (ie. effectively build a filesystem on top
     of a filesystem).  To reduce that amount of metadata that I need, I can
     make the cache granule size larger.

     In both 5 and 6, netfslib gets to tell the VM layer to increase the size
     of the blob in readahead() - and then may have to forcibly keep the pages
     surrounding the page of interest if it gets modified in order to be able
     to write to the cache correctly, depending on how much integrity I want
     to try and keep in the cache.

 (7) Not-quite-direct-I/O.  cifs, for example, has a number of variations on
     read and write modes that are kind of but not quite direct I/O.

David
Christoph Hellwig Nov. 4, 2022, 7:27 a.m. UTC | #18
On Wed, Nov 02, 2022 at 10:35:38AM -0700, Darrick J. Wong wrote:
> Just so long as nobody imports that nonfeature of the bufferhead code
> where dirtying an already dirty bufferhead skips marking the folio dirty
> and writepage failures also fail to redirty the page.  That bit us hard
> recently and I strongly prefer not to repeat that.

Yes, that absolutely needs to be avoided.

> > We can always optimize by having a bit for the fairly common all dirty
> > case and only track and look at the array if that is not the case.
> 
> Yes, it would help to make the ranges in the bit array better defined
> than the semi-opencoded logic there is now.  (I'm whining specifically
> about the test_bit calls sprinkled around).

That is an absolute requirement.  It was so obvious to me that I
didn't bother to restate it after two others already pointed it out :)

> Once that's done it
> shouldn't be hard to add one more bit for the all-dirty state.  Though
> I'd want to see the numbers to prove that it saves us time anywhere.

We might be able to just use PG_private_2 in the page for now, but
maybe just adding a flags field to the iomap_page might be a better
idea, as the pageflags tend to have strange entanglements.
Christoph Hellwig Nov. 4, 2022, 7:30 a.m. UTC | #19
On Thu, Nov 03, 2022 at 02:51:10PM +0000, David Howells wrote:
> > > filesystems right now.  Dave Howells' netfs infrastructure is trying
> > > to solve the problem for everyone (and he's been looking at iomap as
> > > inspiration for what he's doing).
> > 
> > Btw, I never understood why the network file systems don't just use
> > iomap.  There is nothing block specific in the core iomap code.
> 
> It creates and submits bio structs all over the place.  This seems to
> require a blockdev.

The core iomap code (fs/iomap/iter.c) does not.  Most users of it
are block device centric right now, but for example the dax.c uses
iomap for byte level DAX accesses without ever looking at a bdev,
and seek.c and fiemap.c do not make any assumptions on the backend
implementation.
Ritesh Harjani (IBM) Nov. 4, 2022, 11:28 a.m. UTC | #20
Thanks everyone for the comments / reviews. Since there are many sub-threads, I
will try to follow up on all the comments/questions raised gradually.

For now, I would like to continue the discussion on the current design, to
understand what sort of problems lie in maintaining bitmaps within the
iomap_page structure (which is the current design). More on that below...

On 22/10/31 03:43AM, Matthew Wilcox wrote:
> On Sat, Oct 29, 2022 at 08:04:22AM +1100, Dave Chinner wrote:
> > To me, this is a fundamental architecture change in the way iomap
> > interfaces with the page cache and filesystems. Folio based dirty
> > tracking is top down, whilst filesystem block based dirty tracking
> > *needs* to be bottom up.
> > 
> > The bottom up approach is what bufferheads do, and it requires a
> > much bigger change that just adding dirty region tracking to the
> > iomap write and writeback paths.
> 
> I agree that bufferheads do bottom-up dirty tracking, but I don't think
> that what Ritesh is doing here is bottom-up dirty tracking.  Buffer
> heads expose an API to dirty a block, which necessarily goes bottom-up.
> There's no API here to dirty a block.  Instead there's an API to dirty
> a range of a folio, so we're still top-down; we're just keeping track
> of it in a more precise way.

I think we are still top-down too. Thanks.

So IIUC, the bottom-up approach here means that the I/O could be done using the
underlying buffer_heads without the page/folio knowing about it.
This creates coherency problems, due to which we needed to mark
the folio clean when there are no dirty buffers attached to it.
Since the I/O can be done via block buffer heads without involving the VM,
with the folio later marked clean, we call this bottom-up dirty tracking.

(There are some comments about it in try_to_free_buffers(), __folio_cancel_dirty(),
& buffer_busy() too)

Since we are not doing anything like the above, we still follow the
top-down approach.

Please correct me if the above is not true or if I am missing anything.

> 
> It's a legitimate complaint that there's now state that needs to be
> kept in sync with the page cache.  More below ...

Yes... I too would like to understand more about this.


> 
> > That is, moving to tracking dirty regions on a filesystem block
> > boundary brings back all the coherency problems we had with
> > trying to keep bufferhead dirty state coherent with page dirty
> > state. This was one of the major simplifications that the iomap
> > infrastructure brought to the table - all the dirty tracking is done
> > by the page cache, and the filesystem has nothing to do with it at
> > all....
> > 
> > IF we are going to change this, then there needs to be clear rules
> > on how iomap dirty state is kept coherent with the folio dirty
> > state, and there need to be checks placed everywhere to ensure that
> > the rules are followed and enforced.
> > 
> > So what are the rules? If the folio is dirty, it must have at least one
> > dirty region? If the folio is clean, can it have dirty regions?
> 
> If there is any dirty region, the folio must be marked dirty (otherwise
> we'll never know that it needs to be written back).  

Yes. That is my understanding too.

> The interesting question (as your paragraph below hints) is whether removing the dirty
> part of a folio from a file marks the folio clean.  I believe that's
> optional, but it's probably worth doing.

So in the current design, the writeback path will anyway call
clear_page_dirty_for_io() before calling iomap_do_writepage(), in which we
clear the dirty bits from iop->state for writing. See the simplified call
flow below.
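Roughly, the per-folio ordering in question is the following (a simplified
call flow combining the generic writeback code with this patch, not exact
code):

	iomap_writepages()
	  write_cache_pages()
	    folio_clear_dirty_for_io(folio);              /* folio-level dirty bit */
	    iomap_do_writepage()
	      iomap_writepage_map()
	        iomap_clear_range_dirty(folio, iop, ...); /* iop->state dirty bits */
	        folio_start_writeback(folio);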

But what about folio_cancel_dirty()? Is that what you are referring to here?

> 
> > What happens to the dirty regions when truncate zeros part of a page
> > beyond EOF? If the iomap regions are clean, do they need to be
> > dirtied? If the regions are dirtied, do they need to be cleaned?
> > Does this hold for all trailing filesystem blocks in the (multipage)
> > folio, or just the one that spans the new EOF?

So this is handled the same way; there are no changes in the above.
If a truncate zeroes part of a page beyond EOF, we don't do any I/O for
the straddling range beyond EOF. We only zero that region in memory
and limit end_pos to i_size. This is taken care of in iomap_do_writepage().

> > 
> > What happens with direct extent manipulation like fallocate()
> > operations? These invalidate the parts of the page cache over the
> > range we are punching, shifting, etc, without interacting directly
> > with iomap, so do we now have to ensure that the sub-folio dirty
> > regions are also invalidated correctly? i.e. do functions like
> > xfs_flush_unmap_range() need to become iomap infrastructure so that
> > they can update sub-folio dirty ranges correctly?
> 
> I'm slightly confused by this question.  As I understand the various
> fallocate operations, they start by kicking out all the folios affected
> by the operation (generally from the start of the operation to EOF),
> so we'd writeback the (dirty part of) folios which are dirty, then
> invalidate the folios in cache.  I'm not sure there's going to be
> much difference.

So I looked into this part.

For fallocate operations like hole punch, zero range or collapse,
XFS calls xfs_flush_unmap_range(), which calls
filemap_write_and_wait_range() followed by truncate_pagecache_range().
This should write back all dirty ranges in the affected region, followed by
unmap, folio_invalidate() and finally removing the given folio from the page cache.
In folio_invalidate() we call aops->invalidate_folio == iomap_invalidate_folio()

So I think if we are tracking a dirty state bitmap within iop, we should unset
the dirty bits for the given range here in iomap_invalidate_folio().
(This is not currently done in this patch; see the sketch below.)
Although the above is theoretically the right thing to do, I am not sure which
operation could actually cause a problem in case we don't unset the bits.
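A rough, untested sketch of that idea, on top of the existing
iomap_invalidate_folio() and reusing the iomap_clear_range_dirty() helper
added by this patch (trace call dropped for brevity; the partial-invalidation
branch is the hypothetical part):

void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
	if (offset == 0 && len == folio_size(folio)) {
		/* Whole-folio invalidation: folio-level dirty state suffices. */
		WARN_ON_ONCE(folio_test_writeback(folio));
		folio_cancel_dirty(folio);
		iomap_page_release(folio);
	} else {
		/* Partial invalidation: also drop the subpage dirty bits. */
		iomap_clear_range_dirty(folio, to_iomap_page(folio),
					offset, len);
	}
}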

My understanding is that no operation directly on the file can sneak in between
filemap_write_and_wait_range() & truncate_pagecache_range() and make the
folio dirty (since we have taken the inode->i_rwsem write lock).
And for the mmap write fault path (which could sneak in), we mark the entire
page as dirty there anyway, so the entire page is going to be written out.

Any idea what operations would require us to clear the dirty bits for the given
subpage range in iomap_invalidate_folio() for correctness?

(Is it those folio_mark_dirty() "external operations"? More on that below.)

> 
> > What about the
> > folio_mark_dirty()/filemap_dirty_folio()/.folio_dirty()
> > infrastructure? iomap currently treats this as top down, so it
> > doesn't actually call back into iomap to mark filesystem blocks
> > dirty. This would need to be rearchitected to match
> > block_dirty_folio() where the bufferheads on the page are marked
> > dirty before the folio is marked dirty by external operations....

What are these "external operations"... continuing below

> 
> Yes.  This is also going to be a performance problem.  Marking a folio as
> dirty is no longer just setting the bit in struct folio and the xarray
> but also setting all the bits in iop->state.  

On the setting-the-bits part, the bitmap_*() APIs should generally be
optimized for setting more than one bit at a time, though.

> Depending on the size
> of the folio, and the fs blocksize, this could be quite a lot of bits.
> eg a 2MB folio with a 1k block size is 2048 bits (256 bytes, 6 cachelines
> (it dirties the spinlock in cacheline 0, then the bitmap occupies 3 full
> cachelines and 2 partial ones)).

ok. the "state_lock" spinlock and "state[]" bitmaps within struct "iop_page".

> 
> I don't see the necessary churn from filemap_dirty_folio() to
> iomap_dirty_folio() as being a huge deal, but it's definitely a missing
> piece from this RFC.

Yes, this is what I am currently investigating. So what are those 
"external operations" which would require us to call into iomap to mark
all the bitmaps as dirty from within folio_mark_dirty()? 

I did look into the callers of folio_mark_dirty(), but I am unable to identify
such operations that would show me the coherency problem here,
i.e. why we need to call iomap_set_range_dirty() when calling
folio_mark_dirty().

So... when do we mark the dirty bitmaps of iop->state?

At two places -
1. When the file_write_iter operation is called. This will call into
   iomap_write_iter() -> iomap_write_end(). There, before marking the
   folio as dirty, we call iomap_set_range_dirty() followed by
   filemap_dirty_folio().

2. The other case is the VM write fault path, i.e. iomap_page_mkwrite().
   This will call iomap_folio_mkwrite_iter() for the given folio, in which we
   mark the entire range of the folio as dirty by calling iomap_set_range_dirty()
   followed by folio_mark_dirty().

Now when do we clear these dirty bitmaps?
1. We clear these bitmaps at writeback time, when writeback calls into
   iomap_writepage_map() for every folio. There, before submitting the
   ioend/bio, we call iomap_clear_range_dirty() followed by
   folio_start_writeback().

Any idea which path can help me understand why we should call
iomap_set_range_dirty() when folio_mark_dirty() is called by "external
operations"? Some such operations which I was investigating were -
-> I couldn't find any in truncate/fallocate/hole punch/etc.
-> process_vm_writev?
-> add_to_swap -> folio_mark_dirty()?
-> migrate?

And as I understand it, we are doing this to maintain coherency, i.e. if any
folio is marked as dirty, then its corresponding iop->state bitmap should also
be marked dirty.

And should the same then hold for folio_cancel_dirty() too?

> 
> > The easy part of this problem is tracking dirty state on a
> > filesystem block boundaries. The *hard part* maintaining coherency
> > with the page cache, and none of that has been done yet. I'd prefer
> > that we deal with this problem once and for all at the page cache
> > level because multi-page folios mean even when the filesystem block
> > is the same as PAGE_SIZE, we have this sub-folio block granularity
> > tracking issue.
> > 

On the multi-page folio / large folio thing, I will try to check more about it.

I could see checks like "whether the mapping supports large folios" in
iomap_write_begin(), but I am not sure of the current status of that
(because even though we calculate "len" in iomap_write_begin() based on
whether the mapping supports large folios, we currently never pass a "len"
larger than PAGE_SIZE to iomap_write_begin() from iomap_write_iter(),
though we do from iomap_zero_iter()).
So I will spend some time reading up on which FS operations support large
folios, and on their current status, to do more testing with large folio
support.


- ritesh

> > As it is, we already have the capability for the mapping tree to
> > have multiple indexes pointing to the same folio - perhaps it's time
> > to start thinking about using filesystem blocks as the mapping tree
> > index rather than PAGE_SIZE chunks, so that the page cache can then
> > track dirty state on filesystem block boundaries natively and
> > this whole problem goes away. We have to solve this sub-folio dirty
> > tracking problem for multi-page folios anyway, so it seems to me
> > that we should solve the sub-page block size dirty tracking problem
> > the same way....
> 
> That's an interesting proposal.  From the page cache's point of
> view right now, there is only one dirty bit per folio, not per page.
> Anything you see contrary to that is old code that needs to be converted.
> So even indexing the page cache by block offset rather than page offset
> wouldn't help.
> 
> We have a number of people looking at the analogous problem for network
> filesystems right now.  Dave Howells' netfs infrastructure is trying
> to solve the problem for everyone (and he's been looking at iomap as
> inspiration for what he's doing).  I'm kind of hoping we end up with one
> unified solution that can be used for all filesystems that want sub-folio
> dirty tracking.  His solution is a bit more complex than I really want
> to see, at least partially because he's trying to track dirtiness at
> byte granularity, no matter how much pain that causes to the server.
Ritesh Harjani (IBM) Nov. 4, 2022, 2:15 p.m. UTC | #21
On 22/11/04 12:27AM, Christoph Hellwig wrote:
> On Wed, Nov 02, 2022 at 10:35:38AM -0700, Darrick J. Wong wrote:
> > Just so long as nobody imports that nonfeature of the bufferhead code
> > where dirtying an already dirty bufferhead skips marking the folio dirty
> > and writepage failures also fail to redirty the page.  That bit us hard
> > recently and I strongly prefer not to repeat that.
> 
> Yes, that absolutely needs to be avoided.

Sure, I am trying to discuss & investigate more along the same lines in the
other thread, to avoid the coherency problems (if any).

i.e. to understand the rules for keeping the iop->state[] dirty bitmap in
sync with folio dirtiness (in folio_mark_dirty() / folio_cancel_dirty()):
when are those required, and what "external operations" could require
folio_mark_dirty() to also call iomap_set_range_dirty() to dirty the
iop->state[] bitmaps.
Similarly for marking it clean, e.g. in folio_cancel_dirty().

In parallel, I am looking into putting together a quick test case which could
help me replicate such a scenario.

> 
> > > We can always optimize by having a bit for the fairly common all dirty
> > > case and only track and look at the array if that is not the case.
> > 
> > Yes, it would help to make the ranges in the bit array better defined
> > than the semi-opencoded logic there is now.  (I'm whining specifically
> > about the test_bit calls sprinkled around).
> 
> That is an absolute requirement.  It was so obvious to me that I
> didn't bother to restate it after two others already pointed it out :)

Yes. I will make those changes once the rest of the things are sorted.

> 
> > Once that's done it
> > shouldn't be hard to add one more bit for the all-dirty state.  Though
> > I'd want to see the numbers to prove that it saves us time anywhere.
> 
> We might be able to just use PG_private_2 in the page for now, but
> maybe just adding a flags field to the iomap_page might be a better
> idea, as the pageflags tend to have strange entanglements.

Sure, thanks for the suggestions. 
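
For the record, one possible shape for that flags idea (purely illustrative;
the flags field and IOP_ALL_DIRTY do not exist in this series, while the
other fields follow patch 1 of this series):

/* Hypothetical fast-path flag: every block in the folio is dirty. */
#define IOP_ALL_DIRTY	(1U << 0)

struct iomap_page {
	atomic_t		read_bytes_pending;
	atomic_t		write_bytes_pending;
	spinlock_t		state_lock;
	unsigned int		flags;		/* IOP_* flags */
	unsigned long		state[];	/* uptodate + dirty block bitmaps */
};

Then iomap_set_range_dirty() could set IOP_ALL_DIRTY when the whole folio is
dirtied, and writeback could skip the per-block bitmap walk in that case.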

-ritesh
David Howells Nov. 7, 2022, 1:03 p.m. UTC | #22
Christoph Hellwig <hch@infradead.org> wrote:

> The core iomap code (fs/iomap/iter.c) does not.  Most users of it
> are block device centric right now, but for example the dax.c uses
> iomap for byte level DAX accesses without ever looking at a bdev,
> and seek.c and fiemap.c do not make any assumptions on the backend
> implementation.

Whilst that is true, what's in iter.c is extremely minimal and most definitely
not sufficient.  There's no retry logic, for example: what happens when we try
poking the cache and the cache says "no data"?  We have to go back and
redivide the missing bits of the request as the netfs granularity may not
match that of the cache.  Also, how to deal with writes that have to be
duplicated to multiple servers that don't all have the same wsize?

Then functions like iomap_read_folio(), iomap_readahead(), etc. *do* use
submit_bio().  These would seem like they're meant to be the main entry points
into iomap.

Add to that, struct iomap_iter has two bdev pointers and two dax pointers, and
the iomap_ioend struct assumes bio structs are going to be involved.

Also, there's struct iomap_page - I'm hoping to avoid the need for a dangly
struct on each page.  I *think* I only need an extra couple of bits per page
to discriminate between pages that need writing to the cache, pages that need
writing to the server, and pages that need to go to both - but I think it
might be possible to keep track of that in a separate list.  The vast majority
of write patterns are {open,write,write,...,write,close} and for such I just
need a single tracking struct.

David
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 255f9f92668c..31ee80a996b2 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -58,7 +58,7 @@  iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
 	else
 		gfp = GFP_NOFS | __GFP_NOFAIL;
 
-	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
+	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(2 * nr_blocks)),
 		      gfp);
 	if (iop) {
 		spin_lock_init(&iop->state_lock);
@@ -168,6 +168,48 @@  static void iomap_set_range_uptodate(struct folio *folio,
 		folio_mark_uptodate(folio);
 }
 
+static void iomap_iop_set_range_dirty(struct folio *folio,
+		struct iomap_page *iop, size_t off, size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+	unsigned first = (off >> inode->i_blkbits) + nr_blocks;
+	unsigned last = ((off + len - 1) >> inode->i_blkbits) + nr_blocks;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iop->state_lock, flags);
+	bitmap_set(iop->state, first, last - first + 1);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+static void iomap_set_range_dirty(struct folio *folio,
+		struct iomap_page *iop, size_t off, size_t len)
+{
+	if (iop)
+		iomap_iop_set_range_dirty(folio, iop, off, len);
+}
+
+static void iomap_iop_clear_range_dirty(struct folio *folio,
+		struct iomap_page *iop, size_t off, size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+	unsigned first = (off >> inode->i_blkbits) + nr_blocks;
+	unsigned last = ((off + len - 1) >> inode->i_blkbits) + nr_blocks;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iop->state_lock, flags);
+	bitmap_clear(iop->state, first, last - first + 1);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+static void iomap_clear_range_dirty(struct folio *folio,
+		struct iomap_page *iop, size_t off, size_t len)
+{
+	if (iop)
+		iomap_iop_clear_range_dirty(folio, iop, off, len);
+}
+
 static void iomap_finish_folio_read(struct folio *folio, size_t offset,
 		size_t len, int error)
 {
@@ -665,6 +707,7 @@  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	if (unlikely(copied < len && !folio_test_uptodate(folio)))
 		return 0;
 	iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
+	iomap_set_range_dirty(folio, iop, offset_in_folio(folio, pos), len);
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return copied;
 }
@@ -979,6 +1022,8 @@  static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
 		block_commit_write(&folio->page, 0, length);
 	} else {
 		WARN_ON_ONCE(!folio_test_uptodate(folio));
+		iomap_set_range_dirty(folio, to_iomap_page(folio),
+				offset_in_folio(folio, iter->pos), length);
 		folio_mark_dirty(folio);
 	}
 
@@ -1354,7 +1399,8 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * invalid, grab a new one.
 	 */
 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (iop && !test_bit(i, iop->state))
+		if (iop && (!test_bit(i, iop->state) ||
+			    !test_bit(i + nblocks, iop->state)))
 			continue;
 
 		error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1397,6 +1443,9 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		}
 	}
 
+	iomap_clear_range_dirty(folio, iop,
+				offset_in_folio(folio, folio_pos(folio)),
+				end_pos - folio_pos(folio));
 	folio_start_writeback(folio);
 	folio_unlock(folio);