
[RFCv3,7/7] iomap: Optimize data access patterns for filesystems with indirect mappings

Message ID 4e2752e99f55469c4eb5f2fe83e816d529110192.1714046808.git.ritesh.list@gmail.com
State Accepted, archived
Series ext2 iomap changes and iomap improvements

Commit Message

Ritesh Harjani (IBM) April 25, 2024, 1:28 p.m. UTC
This patch optimizes the data access patterns for filesystems with
indirect block mapping by implementing BH_Boundary handling within
iomap.

Currently the bios for reads within iomap are only submitted at
2 places -
1. If we cannot merge the new request with the previous bio, only then
   do we submit the previous bio.
2. We submit the bio at the end of the entire read processing.

This means that for filesystems with indirect block mapping, we call
into ->iomap_begin() again without submitting the previous bios. That
causes suboptimal data access patterns for blocks of BH_Boundary type.

For example, consider the following file mapping:
logical block(4k) 		physical block(4k)
0-11 				1000-1011
12-15 				1013-1016

In the above, physical block 1012 is an indirect metadata block which
holds the mapping information for the next set of blocks (1013-1016).
With iomap buffered reads of the first 16 logical blocks of the file
(0-15), we get the I/O pattern below:
	- submit a bio for 1012
	- complete the bio for 1012
	- submit a bio for 1000-1011
	- submit a bio for 1013-1016
	- complete the bios for 1000-1011
	- complete the bios for 1013-1016

As we can see, this is a non-optimal I/O access pattern, and we also
get 3 bio completions instead of 2.

This patch changes this behavior by calling submit_bio() on any
already-prepared bio before calling ->iomap_begin() again.
That means blocks which have already been processed get submitted for
I/O earlier, and if ->iomap_begin() then needs to read an indirect
metadata block, the block layer can merge that read with the already
submitted request, reducing the no. of bio completions. (In the example
above, the bio for 1000-1011 is now submitted before 1012 is read, so
the read of 1012 can back-merge with it, leaving 2 completions.)
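
For reference, here is a condensed sketch of iomap_readahead_iter()
after this change; it mirrors the hunks in the diff below, with the
new submission point at the end of the loop:

static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{
	loff_t length = iomap_length(iter);
	loff_t done, ret;

	for (done = 0; done < length; done += ret) {
		if (!ctx->cur_folio) {
			ctx->cur_folio = readahead_folio(ctx->rac);
			ctx->cur_folio_in_bio = false;
		}
		ret = iomap_readpage_iter(iter, ctx, done);
		if (ret <= 0)
			return ret;
		/* unlock a fully processed folio that needed no I/O */
		if (ctx->cur_folio && offset_in_folio(ctx->cur_folio,
					iter->pos + done + ret) == 0) {
			if (!ctx->cur_folio_in_bio)
				folio_unlock(ctx->cur_folio);
			ctx->cur_folio = NULL;
		}
	}

	/*
	 * New: submit any prepared bio before returning to iomap_iter(),
	 * i.e. before ->iomap_begin() is called for the next mapping, so
	 * that a metadata read issued there can back-merge with it.
	 */
	if (ctx->bio) {
		submit_bio(ctx->bio);
		ctx->bio = NULL;
	}

	return done;
}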

Now, for bs < ps or with large folios, this patch requires careful
handling of "ifs->read_bytes_pending". We first set
ifs->read_bytes_pending to the folio size, and then subtract from it
in every case where no I/O needs to be submitted (e.g. for
already-uptodate sub-blocks), on an I/O error, and at the completion
of an I/O.
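
Concretely, the submission-side accounting at the top of
iomap_readpage_iter() becomes (condensed from the first hunks of the
diff below):

	ifs = ifs_alloc(iter->inode, folio, iter->flags);
	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);

	if (ifs) {
		loff_t to_read = min_t(loff_t, iter->len - offset,
			folio_size(folio) - offset_in_folio(folio, orig_pos));
		size_t padjust;

		spin_lock_irq(&ifs->state_lock);
		/* initialize r_b_p once, to all bytes this folio must read */
		if (!ifs->read_bytes_pending)
			ifs->read_bytes_pending = to_read;
		/* subtract the leading already-uptodate sub-blocks that
		 * iomap_adjust_read_range() skipped over */
		padjust = pos - orig_pos;
		ifs->read_bytes_pending -= padjust;
		if (!ifs->read_bytes_pending)
			rbp_finished = true;
		spin_unlock_irq(&ifs->state_lock);
	}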

Here is the ftrace output of iomap and the block layer, with the ext2
iomap conversion patches applied -

root# filefrag -b512 -v /mnt1/test/f1
Filesystem type is: ef53
Filesystem cylinder groups approximately 32
File size of /mnt1/test/f1 is 65536 (128 blocks of 512 bytes)
 ext:     logical_offset:        physical_offset: length:   expected: flags:
   0:        0..      95:      98304..     98399:     96:             merged
   1:       96..     127:      98408..     98439:     32:      98400: last,merged,eof
/mnt1/test/f1: 2 extents found

root# #This reads 4 blocks starting from lblk 10, 11, 12, 13
root# xfs_io -c "pread -b$((4*4096)) $((10*4096)) $((4*4096))" /mnt1/test/f1

w/o this patch - (the indirect block is submitted first and does not get merged, resulting in 3 bio completions)
      xfs_io-907     [002] .....   185.608791: iomap_readahead: dev 8:16 ino 0xc nr_pages 4
      xfs_io-907     [002] .....   185.608819: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x4000 processed 0 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x9d/0x2c0
      xfs_io-907     [002] .....   185.608823: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300a000 offset 0xa000 length 0x2000 type MAPPED flags MERGED
      xfs_io-907     [002] .....   185.608831: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1e1/0x2c0
      xfs_io-907     [002] .....   185.608859: block_bio_queue: 8,16 R 98400 + 8 [xfs_io]
      xfs_io-907     [002] .....   185.608865: block_getrq: 8,16 R 98400 + 8 [xfs_io]
      xfs_io-907     [002] .....   185.608867: block_io_start: 8,16 R 4096 () 98400 + 8 [xfs_io]
      xfs_io-907     [002] .....   185.608869: block_plug: [xfs_io]
      xfs_io-907     [002] .....   185.608872: block_unplug: [xfs_io] 1
      xfs_io-907     [002] .....   185.608874: block_rq_insert: 8,16 R 4096 () 98400 + 8 [xfs_io]
kworker/2:1H-198     [002] .....   185.608908: block_rq_issue: 8,16 R 4096 () 98400 + 8 [kworker/2:1H]
      <idle>-0       [002] d.h2.   185.609579: block_rq_complete: 8,16 R () 98400 + 8 [0]
      <idle>-0       [002] dNh2.   185.609631: block_io_done: 8,16 R 0 () 98400 + 0 [swapper/2]
      xfs_io-907     [002] .....   185.609694: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300d000 offset 0xc000 length 0x2000 type MAPPED flags MERGED
      xfs_io-907     [002] .....   185.609704: block_bio_queue: 8,16 RA 98384 + 16 [xfs_io]
      xfs_io-907     [002] .....   185.609718: block_getrq: 8,16 RA 98384 + 16 [xfs_io]
      xfs_io-907     [002] .....   185.609721: block_io_start: 8,16 RA 8192 () 98384 + 16 [xfs_io]
      xfs_io-907     [002] .....   185.609726: block_plug: [xfs_io]
      xfs_io-907     [002] .....   185.609735: iomap_iter: dev 8:16 ino 0xc pos 0xc000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1e1/0x2c0
      xfs_io-907     [002] .....   185.609736: block_bio_queue: 8,16 RA 98408 + 16 [xfs_io]
      xfs_io-907     [002] .....   185.609740: block_getrq: 8,16 RA 98408 + 16 [xfs_io]
      xfs_io-907     [002] .....   185.609741: block_io_start: 8,16 RA 8192 () 98408 + 16 [xfs_io]
      xfs_io-907     [002] .....   185.609756: block_rq_issue: 8,16 RA 8192 () 98408 + 16 [xfs_io]
      xfs_io-907     [002] .....   185.609769: block_rq_issue: 8,16 RA 8192 () 98384 + 16 [xfs_io]
      <idle>-0       [002] d.H2.   185.610280: block_rq_complete: 8,16 RA () 98408 + 16 [0]
      <idle>-0       [002] d.H2.   185.610289: block_io_done: 8,16 RA 0 () 98408 + 0 [swapper/2]
      <idle>-0       [002] d.H2.   185.610292: block_rq_complete: 8,16 RA () 98384 + 16 [0]
      <idle>-0       [002] dNH2.   185.610301: block_io_done: 8,16 RA 0 () 98384 + 0 [swapper/2]

v/s with the patch - (optimized I/O access pattern; the bio gets merged, resulting in only 2 bio completions)
      xfs_io-944     [005] .....    99.926187: iomap_readahead: dev 8:16 ino 0xc nr_pages 4
      xfs_io-944     [005] .....    99.926208: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x4000 processed 0 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x9d/0x2c0
      xfs_io-944     [005] .....    99.926211: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300a000 offset 0xa000 length 0x2000 type MAPPED flags MERGED
      xfs_io-944     [005] .....    99.926222: block_bio_queue: 8,16 RA 98384 + 16 [xfs_io]
      xfs_io-944     [005] .....    99.926232: block_getrq: 8,16 RA 98384 + 16 [xfs_io]
      xfs_io-944     [005] .....    99.926233: block_io_start: 8,16 RA 8192 () 98384 + 16 [xfs_io]
      xfs_io-944     [005] .....    99.926234: block_plug: [xfs_io]
      xfs_io-944     [005] .....    99.926235: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1f9/0x2c0
      xfs_io-944     [005] .....    99.926261: block_bio_queue: 8,16 R 98400 + 8 [xfs_io]
      xfs_io-944     [005] .....    99.926266: block_bio_backmerge: 8,16 R 98400 + 8 [xfs_io]
      xfs_io-944     [005] .....    99.926271: block_unplug: [xfs_io] 1
      xfs_io-944     [005] .....    99.926272: block_rq_insert: 8,16 RA 12288 () 98384 + 24 [xfs_io]
kworker/5:1H-234     [005] .....    99.926314: block_rq_issue: 8,16 RA 12288 () 98384 + 24 [kworker/5:1H]
      <idle>-0       [005] d.h2.    99.926905: block_rq_complete: 8,16 RA () 98384 + 24 [0]
      <idle>-0       [005] dNh2.    99.926931: block_io_done: 8,16 RA 0 () 98384 + 0 [swapper/5]
      xfs_io-944     [005] .....    99.926971: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300d000 offset 0xc000 length 0x2000 type MAPPED flags MERGED
      xfs_io-944     [005] .....    99.926981: block_bio_queue: 8,16 RA 98408 + 16 [xfs_io]
      xfs_io-944     [005] .....    99.926989: block_getrq: 8,16 RA 98408 + 16 [xfs_io]
      xfs_io-944     [005] .....    99.926989: block_io_start: 8,16 RA 8192 () 98408 + 16 [xfs_io]
      xfs_io-944     [005] .....    99.926991: block_plug: [xfs_io]
      xfs_io-944     [005] .....    99.926993: iomap_iter: dev 8:16 ino 0xc pos 0xc000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1f9/0x2c0
      xfs_io-944     [005] .....    99.927001: block_rq_issue: 8,16 RA 8192 () 98408 + 16 [xfs_io]
      <idle>-0       [005] d.h2.    99.927397: block_rq_complete: 8,16 RA () 98408 + 16 [0]
      <idle>-0       [005] dNh2.    99.927414: block_io_done: 8,16 RA 0 () 98408 + 0 [swapper/5]

Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
cc: Ojaswin Mujoo <ojaswin@linux.ibm.com>
---
 fs/iomap/buffered-io.c | 112 +++++++++++++++++++++++++++++++----------
 1 file changed, 85 insertions(+), 27 deletions(-)

--
2.44.0

Comments

Darrick J. Wong April 26, 2024, 4:24 p.m. UTC | #1
On Thu, Apr 25, 2024 at 06:58:51PM +0530, Ritesh Harjani (IBM) wrote:
> This patch optimizes the data access patterns for filesystems with
> indirect block mapping by implementing BH_Boundary handling within
> iomap.
> 
> Currently the bios for reads within iomap are only submitted at
> 2 places -
> 1. If we cannot merge the new req. with previous bio, only then we
>    submit the previous bio.
> 2. Submit the bio at the end of the entire read processing.
> 
> This means for filesystems with indirect block mapping, we call into
> ->iomap_begin() again w/o submitting the previous bios. That causes
> unoptimized data access patterns for blocks which are of BH_Boundary type.
> 
> For e.g. consider the file mapping
> logical block(4k) 		physical block(4k)
> 0-11 				1000-1011
> 12-15 				1013-1016
> 
> In above physical block 1012 is an indirect metadata block which has the
> mapping information for next set of indirect blocks (1013-1016).
> With iomap buffered reads for reading 1st 16 logical blocks of a file
> (0-15), we get below I/O pattern
> 	- submit a bio for 1012
> 	- complete the bio for 1012
> 	- submit a bio for 1000-1011
> 	- submit a bio for 1013-1016
> 	- complete the bios for 1000-1011
> 	- complete the bios for 1013-1016
> 
> So as we can see, above is an non-optimal I/O access pattern and also we
> get 3 bio completions instead of 2.
> 
> This patch changes this behavior by doing submit_bio() if there are any
> bios already processed, before calling ->iomap_begin() again.
> That means if there are any blocks which are already processed, gets
> submitted for I/O earlier and then within ->iomap_begin(), if we get a
> request for reading an indirect metadata block, then block layer can merge
> those bios with the already submitted read request to reduce the no. of bio
> completions.
> 
> Now, for bs < ps or for large folios, this patch requires proper handling
> of "ifs->read_bytes_pending". In that we first set ifs->read_bytes_pending
> to folio_size. Then handle all the cases where we need to subtract
> ifs->read_bytes_pending either during the submission side
> (if we don't need to submit any I/O - for e.g. for uptodate sub blocks),
> or during an I/O error, or at the completion of an I/O.
> 
> Here is the ftrace output of iomap and block layer with ext2 iomap
> conversion patches -
> 
> root# filefrag -b512 -v /mnt1/test/f1
> Filesystem type is: ef53
> Filesystem cylinder groups approximately 32
> File size of /mnt1/test/f1 is 65536 (128 blocks of 512 bytes)
>  ext:     logical_offset:        physical_offset: length:   expected: flags:
>    0:        0..      95:      98304..     98399:     96:             merged
>    1:       96..     127:      98408..     98439:     32:      98400: last,merged,eof
> /mnt1/test/f1: 2 extents found
> 
> root# #This reads 4 blocks starting from lblk 10, 11, 12, 13
> root# xfs_io -c "pread -b$((4*4096)) $((10*4096)) $((4*4096))" /mnt1/test/f1
> 
> w/o this patch - (indirect block is submitted before and does not get merged, resulting in 3 bios completion)
>       xfs_io-907     [002] .....   185.608791: iomap_readahead: dev 8:16 ino 0xc nr_pages 4
>       xfs_io-907     [002] .....   185.608819: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x4000 processed 0 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x9d/0x2c0
>       xfs_io-907     [002] .....   185.608823: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300a000 offset 0xa000 length 0x2000 type MAPPED flags MERGED
>       xfs_io-907     [002] .....   185.608831: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1e1/0x2c0
>       xfs_io-907     [002] .....   185.608859: block_bio_queue: 8,16 R 98400 + 8 [xfs_io]
>       xfs_io-907     [002] .....   185.608865: block_getrq: 8,16 R 98400 + 8 [xfs_io]
>       xfs_io-907     [002] .....   185.608867: block_io_start: 8,16 R 4096 () 98400 + 8 [xfs_io]
>       xfs_io-907     [002] .....   185.608869: block_plug: [xfs_io]
>       xfs_io-907     [002] .....   185.608872: block_unplug: [xfs_io] 1
>       xfs_io-907     [002] .....   185.608874: block_rq_insert: 8,16 R 4096 () 98400 + 8 [xfs_io]
> kworker/2:1H-198     [002] .....   185.608908: block_rq_issue: 8,16 R 4096 () 98400 + 8 [kworker/2:1H]
>       <idle>-0       [002] d.h2.   185.609579: block_rq_complete: 8,16 R () 98400 + 8 [0]
>       <idle>-0       [002] dNh2.   185.609631: block_io_done: 8,16 R 0 () 98400 + 0 [swapper/2]
>       xfs_io-907     [002] .....   185.609694: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300d000 offset 0xc000 length 0x2000 type MAPPED flags MERGED
>       xfs_io-907     [002] .....   185.609704: block_bio_queue: 8,16 RA 98384 + 16 [xfs_io]
>       xfs_io-907     [002] .....   185.609718: block_getrq: 8,16 RA 98384 + 16 [xfs_io]
>       xfs_io-907     [002] .....   185.609721: block_io_start: 8,16 RA 8192 () 98384 + 16 [xfs_io]
>       xfs_io-907     [002] .....   185.609726: block_plug: [xfs_io]
>       xfs_io-907     [002] .....   185.609735: iomap_iter: dev 8:16 ino 0xc pos 0xc000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1e1/0x2c0
>       xfs_io-907     [002] .....   185.609736: block_bio_queue: 8,16 RA 98408 + 16 [xfs_io]
>       xfs_io-907     [002] .....   185.609740: block_getrq: 8,16 RA 98408 + 16 [xfs_io]
>       xfs_io-907     [002] .....   185.609741: block_io_start: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>       xfs_io-907     [002] .....   185.609756: block_rq_issue: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>       xfs_io-907     [002] .....   185.609769: block_rq_issue: 8,16 RA 8192 () 98384 + 16 [xfs_io]
>       <idle>-0       [002] d.H2.   185.610280: block_rq_complete: 8,16 RA () 98408 + 16 [0]
>       <idle>-0       [002] d.H2.   185.610289: block_io_done: 8,16 RA 0 () 98408 + 0 [swapper/2]
>       <idle>-0       [002] d.H2.   185.610292: block_rq_complete: 8,16 RA () 98384 + 16 [0]
>       <idle>-0       [002] dNH2.   185.610301: block_io_done: 8,16 RA 0 () 98384 + 0 [swapper/2]

Could this be shortened to ... the iomap calls and
block_bio_queue/backmerge?  It's a bit difficult to see the point you're
getting at with all the other noise.

I think you're trying to say that the access pattern here is 98400 ->
98408 -> 98384, which is not sequential?

> v/s with the patch - (optimzed I/O access pattern and bio gets merged resulting in only 2 bios completion)
>       xfs_io-944     [005] .....    99.926187: iomap_readahead: dev 8:16 ino 0xc nr_pages 4
>       xfs_io-944     [005] .....    99.926208: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x4000 processed 0 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x9d/0x2c0
>       xfs_io-944     [005] .....    99.926211: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300a000 offset 0xa000 length 0x2000 type MAPPED flags MERGED
>       xfs_io-944     [005] .....    99.926222: block_bio_queue: 8,16 RA 98384 + 16 [xfs_io]
>       xfs_io-944     [005] .....    99.926232: block_getrq: 8,16 RA 98384 + 16 [xfs_io]
>       xfs_io-944     [005] .....    99.926233: block_io_start: 8,16 RA 8192 () 98384 + 16 [xfs_io]
>       xfs_io-944     [005] .....    99.926234: block_plug: [xfs_io]
>       xfs_io-944     [005] .....    99.926235: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1f9/0x2c0
>       xfs_io-944     [005] .....    99.926261: block_bio_queue: 8,16 R 98400 + 8 [xfs_io]
>       xfs_io-944     [005] .....    99.926266: block_bio_backmerge: 8,16 R 98400 + 8 [xfs_io]
>       xfs_io-944     [005] .....    99.926271: block_unplug: [xfs_io] 1
>       xfs_io-944     [005] .....    99.926272: block_rq_insert: 8,16 RA 12288 () 98384 + 24 [xfs_io]
> kworker/5:1H-234     [005] .....    99.926314: block_rq_issue: 8,16 RA 12288 () 98384 + 24 [kworker/5:1H]
>       <idle>-0       [005] d.h2.    99.926905: block_rq_complete: 8,16 RA () 98384 + 24 [0]
>       <idle>-0       [005] dNh2.    99.926931: block_io_done: 8,16 RA 0 () 98384 + 0 [swapper/5]
>       xfs_io-944     [005] .....    99.926971: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300d000 offset 0xc000 length 0x2000 type MAPPED flags MERGED
>       xfs_io-944     [005] .....    99.926981: block_bio_queue: 8,16 RA 98408 + 16 [xfs_io]
>       xfs_io-944     [005] .....    99.926989: block_getrq: 8,16 RA 98408 + 16 [xfs_io]
>       xfs_io-944     [005] .....    99.926989: block_io_start: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>       xfs_io-944     [005] .....    99.926991: block_plug: [xfs_io]
>       xfs_io-944     [005] .....    99.926993: iomap_iter: dev 8:16 ino 0xc pos 0xc000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1f9/0x2c0
>       xfs_io-944     [005] .....    99.927001: block_rq_issue: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>       <idle>-0       [005] d.h2.    99.927397: block_rq_complete: 8,16 RA () 98408 + 16 [0]
>       <idle>-0       [005] dNh2.    99.927414: block_io_done: 8,16 RA 0 () 98408 + 0 [swapper/5]
> 
> Suggested-by: Matthew Wilcox <willy@infradead.org>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> cc: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> ---
>  fs/iomap/buffered-io.c | 112 +++++++++++++++++++++++++++++++----------
>  1 file changed, 85 insertions(+), 27 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 0a4269095ae2..a1d50086a3f5 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -30,7 +30,7 @@ typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
>   */
>  struct iomap_folio_state {
>  	spinlock_t		state_lock;
> -	unsigned int		read_bytes_pending;
> +	size_t			read_bytes_pending;
>  	atomic_t		write_bytes_pending;
> 
>  	/*
> @@ -380,6 +380,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>  	loff_t orig_pos = pos;
>  	size_t poff, plen;
>  	sector_t sector;
> +	bool rbp_finished = false;

What is "rbp"?  My assembly programmer brain says x64 frame pointer, but
that's clearly wrong here.  Maybe I'm confused...

>  	if (iomap->type == IOMAP_INLINE)
>  		return iomap_read_inline_data(iter, folio);
> @@ -387,21 +388,39 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>  	/* zero post-eof blocks as the page may be mapped */
>  	ifs = ifs_alloc(iter->inode, folio, iter->flags);
>  	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
> +
> +	if (ifs) {
> +		loff_t to_read = min_t(loff_t, iter->len - offset,
> +			folio_size(folio) - offset_in_folio(folio, orig_pos));
> +		size_t padjust;
> +
> +		spin_lock_irq(&ifs->state_lock);
> +		if (!ifs->read_bytes_pending)
> +			ifs->read_bytes_pending = to_read;
> +		padjust = pos - orig_pos;
> +		ifs->read_bytes_pending -= padjust;
> +		if (!ifs->read_bytes_pending)
> +			rbp_finished = true;
> +		spin_unlock_irq(&ifs->state_lock);
> +	}
> +
>  	if (plen == 0)
>  		goto done;
> 
>  	if (iomap_block_needs_zeroing(iter, pos)) {
> +		if (ifs) {
> +			spin_lock_irq(&ifs->state_lock);
> +			ifs->read_bytes_pending -= plen;
> +			if (!ifs->read_bytes_pending)
> +				rbp_finished = true;
> +			spin_unlock_irq(&ifs->state_lock);
> +		}
>  		folio_zero_range(folio, poff, plen);
>  		iomap_set_range_uptodate(folio, poff, plen);
>  		goto done;
>  	}
> 
>  	ctx->cur_folio_in_bio = true;
> -	if (ifs) {
> -		spin_lock_irq(&ifs->state_lock);
> -		ifs->read_bytes_pending += plen;
> -		spin_unlock_irq(&ifs->state_lock);
> -	}
> 
>  	sector = iomap_sector(iomap, pos);
>  	if (!ctx->bio ||
> @@ -435,6 +454,14 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>  	}
> 
>  done:
> +	/*
> +	 * If there is no bio prepared and if rbp is finished and
> +	 * this was the last offset within this folio then mark
> +	 * cur_folio_in_bio to false.
> +	 */
> +	if (!ctx->bio && rbp_finished &&
> +			offset_in_folio(folio, pos + plen) == 0)
> +		ctx->cur_folio_in_bio = false;

...yes, I think I am confused.  When would ctx->bio be NULL but
cur_folio_in_bio is true?

I /think/ what you're doing here is using read_bytes_pending to figure
out if you've processed the folio up to the end of the mapping?  But
then you submit the bio unconditionally below for each readpage_iter
call?

Why not add an IOMAP_BOUNDARY flag that means "I will have to do some IO
if you call ->iomap_begin again"?  Then if we get to this point in
readpage_iter with a ctx->bio, we can submit the bio, clear
cur_folio_in_bio, and return?  And then you don't need this machinery?
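
Something like this, purely as a sketch (IOMAP_F_BOUNDARY here is a
hypothetical flag name that ->iomap_begin would set, not something
defined in this series):

	/* in iomap_readpage_iter(), once the current range is mapped */
	if ((iomap->flags & IOMAP_F_BOUNDARY) && ctx->bio) {
		/*
		 * The next ->iomap_begin() will have to read metadata;
		 * push the data bio out now so the two can merge.
		 */
		submit_bio(ctx->bio);
		ctx->bio = NULL;
		ctx->cur_folio_in_bio = false;
	}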

--D

>  	/*
>  	 * Move the caller beyond our range so that it keeps making progress.
>  	 * For that, we have to include any leading non-uptodate ranges, but
> @@ -459,9 +486,43 @@ static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
>  			return ret;
>  	}
> 
> +	if (ctx->bio) {
> +		submit_bio(ctx->bio);
> +		WARN_ON_ONCE(!ctx->cur_folio_in_bio);
> +		ctx->bio = NULL;
> +	}
> +	if (offset_in_folio(folio, iter->pos + done) == 0 &&
> +			!ctx->cur_folio_in_bio) {
> +		folio_unlock(ctx->cur_folio);
> +	}
> +
>  	return done;
>  }
> 
> +static void iomap_handle_read_error(struct iomap_readpage_ctx *ctx,
> +		struct iomap_iter *iter)
> +{
> +	struct folio *folio = ctx->cur_folio;
> +	struct iomap_folio_state *ifs;
> +	unsigned long flags;
> +	bool rbp_finished = false;
> +	size_t rbp_adjust = folio_size(folio) - offset_in_folio(folio,
> +				iter->pos);
> +	ifs = folio->private;
> +	if (!ifs || !ifs->read_bytes_pending)
> +		goto unlock;
> +
> +	spin_lock_irqsave(&ifs->state_lock, flags);
> +	ifs->read_bytes_pending -= rbp_adjust;
> +	if (!ifs->read_bytes_pending)
> +		rbp_finished = true;
> +	spin_unlock_irqrestore(&ifs->state_lock, flags);
> +
> +unlock:
> +	if (rbp_finished || !ctx->cur_folio_in_bio)
> +		folio_unlock(folio);
> +}
> +
>  int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
>  {
>  	struct iomap_iter iter = {
> @@ -479,15 +540,9 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
>  	while ((ret = iomap_iter(&iter, ops)) > 0)
>  		iter.processed = iomap_read_folio_iter(&iter, &ctx);
> 
> -	if (ret < 0)
> +	if (ret < 0) {
>  		folio_set_error(folio);
> -
> -	if (ctx.bio) {
> -		submit_bio(ctx.bio);
> -		WARN_ON_ONCE(!ctx.cur_folio_in_bio);
> -	} else {
> -		WARN_ON_ONCE(ctx.cur_folio_in_bio);
> -		folio_unlock(folio);
> +		iomap_handle_read_error(&ctx, &iter);
>  	}
> 
>  	/*
> @@ -506,12 +561,6 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
>  	loff_t done, ret;
> 
>  	for (done = 0; done < length; done += ret) {
> -		if (ctx->cur_folio &&
> -		    offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
> -			if (!ctx->cur_folio_in_bio)
> -				folio_unlock(ctx->cur_folio);
> -			ctx->cur_folio = NULL;
> -		}
>  		if (!ctx->cur_folio) {
>  			ctx->cur_folio = readahead_folio(ctx->rac);
>  			ctx->cur_folio_in_bio = false;
> @@ -519,6 +568,17 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
>  		ret = iomap_readpage_iter(iter, ctx, done);
>  		if (ret <= 0)
>  			return ret;
> +		if (ctx->cur_folio && offset_in_folio(ctx->cur_folio,
> +					iter->pos + done + ret) == 0) {
> +			if (!ctx->cur_folio_in_bio)
> +				folio_unlock(ctx->cur_folio);
> +			ctx->cur_folio = NULL;
> +		}
> +	}
> +
> +	if (ctx->bio) {
> +		submit_bio(ctx->bio);
> +		ctx->bio = NULL;
>  	}
> 
>  	return done;
> @@ -549,18 +609,16 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
>  	struct iomap_readpage_ctx ctx = {
>  		.rac	= rac,
>  	};
> +	int ret = 0;
> 
>  	trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
> 
> -	while (iomap_iter(&iter, ops) > 0)
> +	while ((ret = iomap_iter(&iter, ops)) > 0)
>  		iter.processed = iomap_readahead_iter(&iter, &ctx);
> 
> -	if (ctx.bio)
> -		submit_bio(ctx.bio);
> -	if (ctx.cur_folio) {
> -		if (!ctx.cur_folio_in_bio)
> -			folio_unlock(ctx.cur_folio);
> -	}
> +	if (ret < 0 && ctx.cur_folio)
> +		iomap_handle_read_error(&ctx, &iter);
> +
>  }
>  EXPORT_SYMBOL_GPL(iomap_readahead);
> 
> --
> 2.44.0
> 
>
Ritesh Harjani (IBM) April 26, 2024, 5:17 p.m. UTC | #2
"Darrick J. Wong" <djwong@kernel.org> writes:

> On Thu, Apr 25, 2024 at 06:58:51PM +0530, Ritesh Harjani (IBM) wrote:
>> This patch optimizes the data access patterns for filesystems with
>> indirect block mapping by implementing BH_Boundary handling within
>> iomap.
>> 
>> Currently the bios for reads within iomap are only submitted at
>> 2 places -
>> 1. If we cannot merge the new req. with previous bio, only then we
>>    submit the previous bio.
>> 2. Submit the bio at the end of the entire read processing.
>> 
>> This means for filesystems with indirect block mapping, we call into
>> ->iomap_begin() again w/o submitting the previous bios. That causes
>> unoptimized data access patterns for blocks which are of BH_Boundary type.
>> 
>> For e.g. consider the file mapping
>> logical block(4k) 		physical block(4k)
>> 0-11 				1000-1011
>> 12-15 				1013-1016
>> 
>> In above physical block 1012 is an indirect metadata block which has the
>> mapping information for next set of indirect blocks (1013-1016).
>> With iomap buffered reads for reading 1st 16 logical blocks of a file
>> (0-15), we get below I/O pattern
>> 	- submit a bio for 1012
>> 	- complete the bio for 1012
>> 	- submit a bio for 1000-1011
>> 	- submit a bio for 1013-1016
>> 	- complete the bios for 1000-1011
>> 	- complete the bios for 1013-1016
>> 
>> So as we can see, above is an non-optimal I/O access pattern and also we
>> get 3 bio completions instead of 2.
>> 
>> This patch changes this behavior by doing submit_bio() if there are any
>> bios already processed, before calling ->iomap_begin() again.
>> That means if there are any blocks which are already processed, gets
>> submitted for I/O earlier and then within ->iomap_begin(), if we get a
>> request for reading an indirect metadata block, then block layer can merge
>> those bios with the already submitted read request to reduce the no. of bio
>> completions.
>> 
>> Now, for bs < ps or for large folios, this patch requires proper handling
>> of "ifs->read_bytes_pending". In that we first set ifs->read_bytes_pending
>> to folio_size. Then handle all the cases where we need to subtract
>> ifs->read_bytes_pending either during the submission side
>> (if we don't need to submit any I/O - for e.g. for uptodate sub blocks),
>> or during an I/O error, or at the completion of an I/O.
>> 
>> Here is the ftrace output of iomap and block layer with ext2 iomap
>> conversion patches -
>> 
>> root# filefrag -b512 -v /mnt1/test/f1
>> Filesystem type is: ef53
>> Filesystem cylinder groups approximately 32
>> File size of /mnt1/test/f1 is 65536 (128 blocks of 512 bytes)
>>  ext:     logical_offset:        physical_offset: length:   expected: flags:
>>    0:        0..      95:      98304..     98399:     96:             merged
>>    1:       96..     127:      98408..     98439:     32:      98400: last,merged,eof
>> /mnt1/test/f1: 2 extents found
>> 
>> root# #This reads 4 blocks starting from lblk 10, 11, 12, 13
>> root# xfs_io -c "pread -b$((4*4096)) $((10*4096)) $((4*4096))" /mnt1/test/f1
>> 
>> w/o this patch - (indirect block is submitted before and does not get merged, resulting in 3 bios completion)
>>       xfs_io-907     [002] .....   185.608791: iomap_readahead: dev 8:16 ino 0xc nr_pages 4
>>       xfs_io-907     [002] .....   185.608819: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x4000 processed 0 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x9d/0x2c0
>>       xfs_io-907     [002] .....   185.608823: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300a000 offset 0xa000 length 0x2000 type MAPPED flags MERGED
>>       xfs_io-907     [002] .....   185.608831: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1e1/0x2c0
>>       xfs_io-907     [002] .....   185.608859: block_bio_queue: 8,16 R 98400 + 8 [xfs_io]
>>       xfs_io-907     [002] .....   185.608865: block_getrq: 8,16 R 98400 + 8 [xfs_io]
>>       xfs_io-907     [002] .....   185.608867: block_io_start: 8,16 R 4096 () 98400 + 8 [xfs_io]
>>       xfs_io-907     [002] .....   185.608869: block_plug: [xfs_io]
>>       xfs_io-907     [002] .....   185.608872: block_unplug: [xfs_io] 1
>>       xfs_io-907     [002] .....   185.608874: block_rq_insert: 8,16 R 4096 () 98400 + 8 [xfs_io]
>> kworker/2:1H-198     [002] .....   185.608908: block_rq_issue: 8,16 R 4096 () 98400 + 8 [kworker/2:1H]
>>       <idle>-0       [002] d.h2.   185.609579: block_rq_complete: 8,16 R () 98400 + 8 [0]
>>       <idle>-0       [002] dNh2.   185.609631: block_io_done: 8,16 R 0 () 98400 + 0 [swapper/2]
>>       xfs_io-907     [002] .....   185.609694: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300d000 offset 0xc000 length 0x2000 type MAPPED flags MERGED
>>       xfs_io-907     [002] .....   185.609704: block_bio_queue: 8,16 RA 98384 + 16 [xfs_io]
>>       xfs_io-907     [002] .....   185.609718: block_getrq: 8,16 RA 98384 + 16 [xfs_io]
>>       xfs_io-907     [002] .....   185.609721: block_io_start: 8,16 RA 8192 () 98384 + 16 [xfs_io]
>>       xfs_io-907     [002] .....   185.609726: block_plug: [xfs_io]
>>       xfs_io-907     [002] .....   185.609735: iomap_iter: dev 8:16 ino 0xc pos 0xc000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1e1/0x2c0
>>       xfs_io-907     [002] .....   185.609736: block_bio_queue: 8,16 RA 98408 + 16 [xfs_io]
>>       xfs_io-907     [002] .....   185.609740: block_getrq: 8,16 RA 98408 + 16 [xfs_io]
>>       xfs_io-907     [002] .....   185.609741: block_io_start: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>>       xfs_io-907     [002] .....   185.609756: block_rq_issue: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>>       xfs_io-907     [002] .....   185.609769: block_rq_issue: 8,16 RA 8192 () 98384 + 16 [xfs_io]
>>       <idle>-0       [002] d.H2.   185.610280: block_rq_complete: 8,16 RA () 98408 + 16 [0]
>>       <idle>-0       [002] d.H2.   185.610289: block_io_done: 8,16 RA 0 () 98408 + 0 [swapper/2]
>>       <idle>-0       [002] d.H2.   185.610292: block_rq_complete: 8,16 RA () 98384 + 16 [0]
>>       <idle>-0       [002] dNH2.   185.610301: block_io_done: 8,16 RA 0 () 98384 + 0 [swapper/2]
>
> Could this be shortened to ... the iomap calls and
> block_bio_queue/backmerge?  It's a bit difficult to see the point you're
> getting at with all the other noise.

I will remove this log and move it to the cover letter, and will just
extend the simple example from earlier in this commit message to show
the difference with and without the patch.

>
> I think you're trying to say that the access pattern here is 98400 ->
> 98408 -> 98384, which is not sequential?
>

it's (98400,8 ==> metadata block) -> (98384,16 ==> lblk 10 & 11) -> (98408,16 ==> lblk 12 & 13)
... w/o the patch

>> v/s with the patch - (optimzed I/O access pattern and bio gets merged resulting in only 2 bios completion)
>>       xfs_io-944     [005] .....    99.926187: iomap_readahead: dev 8:16 ino 0xc nr_pages 4
>>       xfs_io-944     [005] .....    99.926208: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x4000 processed 0 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x9d/0x2c0
>>       xfs_io-944     [005] .....    99.926211: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300a000 offset 0xa000 length 0x2000 type MAPPED flags MERGED
>>       xfs_io-944     [005] .....    99.926222: block_bio_queue: 8,16 RA 98384 + 16 [xfs_io]
>>       xfs_io-944     [005] .....    99.926232: block_getrq: 8,16 RA 98384 + 16 [xfs_io]
>>       xfs_io-944     [005] .....    99.926233: block_io_start: 8,16 RA 8192 () 98384 + 16 [xfs_io]
>>       xfs_io-944     [005] .....    99.926234: block_plug: [xfs_io]
>>       xfs_io-944     [005] .....    99.926235: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1f9/0x2c0
>>       xfs_io-944     [005] .....    99.926261: block_bio_queue: 8,16 R 98400 + 8 [xfs_io]
>>       xfs_io-944     [005] .....    99.926266: block_bio_backmerge: 8,16 R 98400 + 8 [xfs_io]
>>       xfs_io-944     [005] .....    99.926271: block_unplug: [xfs_io] 1
>>       xfs_io-944     [005] .....    99.926272: block_rq_insert: 8,16 RA 12288 () 98384 + 24 [xfs_io]
>> kworker/5:1H-234     [005] .....    99.926314: block_rq_issue: 8,16 RA 12288 () 98384 + 24 [kworker/5:1H]
>>       <idle>-0       [005] d.h2.    99.926905: block_rq_complete: 8,16 RA () 98384 + 24 [0]
>>       <idle>-0       [005] dNh2.    99.926931: block_io_done: 8,16 RA 0 () 98384 + 0 [swapper/5]
>>       xfs_io-944     [005] .....    99.926971: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300d000 offset 0xc000 length 0x2000 type MAPPED flags MERGED
>>       xfs_io-944     [005] .....    99.926981: block_bio_queue: 8,16 RA 98408 + 16 [xfs_io]
>>       xfs_io-944     [005] .....    99.926989: block_getrq: 8,16 RA 98408 + 16 [xfs_io]
>>       xfs_io-944     [005] .....    99.926989: block_io_start: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>>       xfs_io-944     [005] .....    99.926991: block_plug: [xfs_io]
>>       xfs_io-944     [005] .....    99.926993: iomap_iter: dev 8:16 ino 0xc pos 0xc000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1f9/0x2c0
>>       xfs_io-944     [005] .....    99.927001: block_rq_issue: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>>       <idle>-0       [005] d.h2.    99.927397: block_rq_complete: 8,16 RA () 98408 + 16 [0]
>>       <idle>-0       [005] dNh2.    99.927414: block_io_done: 8,16 RA 0 () 98408 + 0 [swapper/5]
>> 
>> Suggested-by: Matthew Wilcox <willy@infradead.org>
>> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
>> cc: Ojaswin Mujoo <ojaswin@linux.ibm.com>
>> ---
>>  fs/iomap/buffered-io.c | 112 +++++++++++++++++++++++++++++++----------
>>  1 file changed, 85 insertions(+), 27 deletions(-)
>> 
>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>> index 0a4269095ae2..a1d50086a3f5 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -30,7 +30,7 @@ typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
>>   */
>>  struct iomap_folio_state {
>>  	spinlock_t		state_lock;
>> -	unsigned int		read_bytes_pending;
>> +	size_t			read_bytes_pending;
>>  	atomic_t		write_bytes_pending;
>> 
>>  	/*
>> @@ -380,6 +380,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>>  	loff_t orig_pos = pos;
>>  	size_t poff, plen;
>>  	sector_t sector;
>> +	bool rbp_finished = false;
>
> What is "rbp"?  My assembly programmer brain says x64 frame pointer, but
> that's clearly wrong here.  Maybe I'm confused...
>

rbp == read_bytes_pending ;)

>>  	if (iomap->type == IOMAP_INLINE)
>>  		return iomap_read_inline_data(iter, folio);
>> @@ -387,21 +388,39 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>>  	/* zero post-eof blocks as the page may be mapped */
>>  	ifs = ifs_alloc(iter->inode, folio, iter->flags);
>>  	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
>> +
>> +	if (ifs) {
>> +		loff_t to_read = min_t(loff_t, iter->len - offset,
>> +			folio_size(folio) - offset_in_folio(folio, orig_pos));
>> +		size_t padjust;
>> +
>> +		spin_lock_irq(&ifs->state_lock);
>> +		if (!ifs->read_bytes_pending)
>> +			ifs->read_bytes_pending = to_read;
>> +		padjust = pos - orig_pos;
>> +		ifs->read_bytes_pending -= padjust;
>> +		if (!ifs->read_bytes_pending)
>> +			rbp_finished = true;
>> +		spin_unlock_irq(&ifs->state_lock);
>> +	}
>> +
>>  	if (plen == 0)
>>  		goto done;
>> 
>>  	if (iomap_block_needs_zeroing(iter, pos)) {
>> +		if (ifs) {
>> +			spin_lock_irq(&ifs->state_lock);
>> +			ifs->read_bytes_pending -= plen;
>> +			if (!ifs->read_bytes_pending)
>> +				rbp_finished = true;
>> +			spin_unlock_irq(&ifs->state_lock);
>> +		}
>>  		folio_zero_range(folio, poff, plen);
>>  		iomap_set_range_uptodate(folio, poff, plen);
>>  		goto done;
>>  	}
>> 
>>  	ctx->cur_folio_in_bio = true;
>> -	if (ifs) {
>> -		spin_lock_irq(&ifs->state_lock);
>> -		ifs->read_bytes_pending += plen;
>> -		spin_unlock_irq(&ifs->state_lock);
>> -	}
>> 
>>  	sector = iomap_sector(iomap, pos);
>>  	if (!ctx->bio ||
>> @@ -435,6 +454,14 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>>  	}
>> 
>>  done:
>> +	/*
>> +	 * If there is no bio prepared and if rbp is finished and
>> +	 * this was the last offset within this folio then mark
>> +	 * cur_folio_in_bio to false.
>> +	 */
>> +	if (!ctx->bio && rbp_finished &&
>> +			offset_in_folio(folio, pos + plen) == 0)
>> +		ctx->cur_folio_in_bio = false;
>
> ...yes, I think I am confused.  When would ctx->bio be NULL but
> cur_folio_in_bio is true?

We had submitted the bio previously and so set it to NULL, but we
still have ctx->cur_folio set and ctx->cur_folio_in_bio true, since we
haven't completely processed the folio.

>
> I /think/ what you're doing here is using read_bytes_pending to figure
> out if you've processed the folio up to the end of the mapping?  But
> then you submit the bio unconditionally below for each readpage_iter
> call?
>

yes, that's right.

> Why not add an IOMAP_BOUNDARY flag that means "I will have to do some IO
> if you call ->iomap_begin again"?  Then if we get to this point in
> readpage_iter with a ctx->bio, we can submit the bio, clear
> cur_folio_in_bio, and return?  And then you don't need this machinery?

TBH, I initially didn't think the approach taken in the patch would
require such careful handling of r_b_p. That is because of all the
corner cases: when we don't need to read already-uptodate blocks, when
an I/O error occurs, or when the extent spans beyond i_size, we need
to ensure we reduce r_b_p carefully so that we can still unlock the
folio.

So it's all about how we know when we can unlock the folio, i.e. that
all of its corresponding blocks/mappings have been processed or
submitted for I/O.

Assume we have a folio which spans multiple extents. In such a case:
-> we process a bio for the 1st extent,
-> then we go back to iomap_iter() to get the new extent mapping,
-> we now increment r_b_p by this new plen to be processed,
-> we then submit the previous bio, since this new mapping couldn't be
merged with it due to discontiguous extents.
So by first incrementing r_b_p before doing submit_bio(), we ensure
the folio is not unlocked at bio completion.

Maybe it would be helpful to have an easy mechanism that keeps some
state from the time of submit_bio() till the bio completion, to know
that the corresponding folio is still being processed and shouldn't be
unlocked.
 -> This is currently what we do by setting r_b_p to the value of
 folio_size() and then carefully reducing r_b_p in all the cases I
 mentioned above.

Let me think about whether adding an IOMAP_BH_BOUNDARY flag could be
helpful or not. Say we have a pagesize of 64k; that means the first 16
blocks all belong to the same page. So even with an IOMAP_BH_BOUNDARY
flag, the problem that still remains is: even if we submit the bio at
block 11 (the bh_boundary block), how will the bio completion side
know that the folio is not completely processed and so we shouldn't
unlock the folio?
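
To make that concrete (64k folio, 4k blocks, bh_boundary after logical
block 11), this is exactly the per-folio state that the r_b_p scheme
above carries:

	r_b_p at submission start      = folio_size        = 65536
	bio submitted at the boundary  = 12 blocks * 4096  = 49152
	r_b_p after that bio completes = 65536 - 49152     = 16384

16384 bytes (blocks 12-15) are still pending, so the completion
handler knows it must not unlock the folio yet; a bare
IOMAP_BH_BOUNDARY flag alone carries no such per-folio state.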


-ritesh
Ritesh Harjani (IBM) April 26, 2024, 5:25 p.m. UTC | #3
Ritesh Harjani (IBM) <ritesh.list@gmail.com> writes:

> "Darrick J. Wong" <djwong@kernel.org> writes:
>
>> On Thu, Apr 25, 2024 at 06:58:51PM +0530, Ritesh Harjani (IBM) wrote:
>>> This patch optimizes the data access patterns for filesystems with
>>> indirect block mapping by implementing BH_Boundary handling within
>>> iomap.
>>> 
>>> Currently the bios for reads within iomap are only submitted at
>>> 2 places -
>>> 1. If we cannot merge the new req. with previous bio, only then we
>>>    submit the previous bio.
>>> 2. Submit the bio at the end of the entire read processing.
>>> 
>>> This means for filesystems with indirect block mapping, we call into
>>> ->iomap_begin() again w/o submitting the previous bios. That causes
>>> unoptimized data access patterns for blocks which are of BH_Boundary type.
>>> 
>>> For e.g. consider the file mapping
>>> logical block(4k) 		physical block(4k)
>>> 0-11 				1000-1011
>>> 12-15 				1013-1016
>>> 
>>> In above physical block 1012 is an indirect metadata block which has the
>>> mapping information for next set of indirect blocks (1013-1016).
>>> With iomap buffered reads for reading 1st 16 logical blocks of a file
>>> (0-15), we get below I/O pattern
>>> 	- submit a bio for 1012
>>> 	- complete the bio for 1012
>>> 	- submit a bio for 1000-1011
>>> 	- submit a bio for 1013-1016
>>> 	- complete the bios for 1000-1011
>>> 	- complete the bios for 1013-1016
>>> 
>>> So as we can see, above is an non-optimal I/O access pattern and also we
>>> get 3 bio completions instead of 2.
>>> 
>>> This patch changes this behavior by doing submit_bio() if there are any
>>> bios already processed, before calling ->iomap_begin() again.
>>> That means if there are any blocks which are already processed, gets
>>> submitted for I/O earlier and then within ->iomap_begin(), if we get a
>>> request for reading an indirect metadata block, then block layer can merge
>>> those bios with the already submitted read request to reduce the no. of bio
>>> completions.
>>> 
>>> Now, for bs < ps or for large folios, this patch requires proper handling
>>> of "ifs->read_bytes_pending". In that we first set ifs->read_bytes_pending
>>> to folio_size. Then handle all the cases where we need to subtract
>>> ifs->read_bytes_pending either during the submission side
>>> (if we don't need to submit any I/O - for e.g. for uptodate sub blocks),
>>> or during an I/O error, or at the completion of an I/O.
>>> 
>>> Here is the ftrace output of iomap and block layer with ext2 iomap
>>> conversion patches -
>>> 
>>> root# filefrag -b512 -v /mnt1/test/f1
>>> Filesystem type is: ef53
>>> Filesystem cylinder groups approximately 32
>>> File size of /mnt1/test/f1 is 65536 (128 blocks of 512 bytes)
>>>  ext:     logical_offset:        physical_offset: length:   expected: flags:
>>>    0:        0..      95:      98304..     98399:     96:             merged
>>>    1:       96..     127:      98408..     98439:     32:      98400: last,merged,eof
>>> /mnt1/test/f1: 2 extents found
>>> 
>>> root# #This reads 4 blocks starting from lblk 10, 11, 12, 13
>>> root# xfs_io -c "pread -b$((4*4096)) $((10*4096)) $((4*4096))" /mnt1/test/f1
>>> 
>>> w/o this patch - (indirect block is submitted before and does not get merged, resulting in 3 bios completion)
>>>       xfs_io-907     [002] .....   185.608791: iomap_readahead: dev 8:16 ino 0xc nr_pages 4
>>>       xfs_io-907     [002] .....   185.608819: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x4000 processed 0 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x9d/0x2c0
>>>       xfs_io-907     [002] .....   185.608823: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300a000 offset 0xa000 length 0x2000 type MAPPED flags MERGED
>>>       xfs_io-907     [002] .....   185.608831: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1e1/0x2c0
>>>       xfs_io-907     [002] .....   185.608859: block_bio_queue: 8,16 R 98400 + 8 [xfs_io]
>>>       xfs_io-907     [002] .....   185.608865: block_getrq: 8,16 R 98400 + 8 [xfs_io]
>>>       xfs_io-907     [002] .....   185.608867: block_io_start: 8,16 R 4096 () 98400 + 8 [xfs_io]
>>>       xfs_io-907     [002] .....   185.608869: block_plug: [xfs_io]
>>>       xfs_io-907     [002] .....   185.608872: block_unplug: [xfs_io] 1
>>>       xfs_io-907     [002] .....   185.608874: block_rq_insert: 8,16 R 4096 () 98400 + 8 [xfs_io]
>>> kworker/2:1H-198     [002] .....   185.608908: block_rq_issue: 8,16 R 4096 () 98400 + 8 [kworker/2:1H]
>>>       <idle>-0       [002] d.h2.   185.609579: block_rq_complete: 8,16 R () 98400 + 8 [0]
>>>       <idle>-0       [002] dNh2.   185.609631: block_io_done: 8,16 R 0 () 98400 + 0 [swapper/2]
>>>       xfs_io-907     [002] .....   185.609694: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300d000 offset 0xc000 length 0x2000 type MAPPED flags MERGED
>>>       xfs_io-907     [002] .....   185.609704: block_bio_queue: 8,16 RA 98384 + 16 [xfs_io]
>>>       xfs_io-907     [002] .....   185.609718: block_getrq: 8,16 RA 98384 + 16 [xfs_io]
>>>       xfs_io-907     [002] .....   185.609721: block_io_start: 8,16 RA 8192 () 98384 + 16 [xfs_io]
>>>       xfs_io-907     [002] .....   185.609726: block_plug: [xfs_io]
>>>       xfs_io-907     [002] .....   185.609735: iomap_iter: dev 8:16 ino 0xc pos 0xc000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1e1/0x2c0
>>>       xfs_io-907     [002] .....   185.609736: block_bio_queue: 8,16 RA 98408 + 16 [xfs_io]
>>>       xfs_io-907     [002] .....   185.609740: block_getrq: 8,16 RA 98408 + 16 [xfs_io]
>>>       xfs_io-907     [002] .....   185.609741: block_io_start: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>>>       xfs_io-907     [002] .....   185.609756: block_rq_issue: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>>>       xfs_io-907     [002] .....   185.609769: block_rq_issue: 8,16 RA 8192 () 98384 + 16 [xfs_io]
>>>       <idle>-0       [002] d.H2.   185.610280: block_rq_complete: 8,16 RA () 98408 + 16 [0]
>>>       <idle>-0       [002] d.H2.   185.610289: block_io_done: 8,16 RA 0 () 98408 + 0 [swapper/2]
>>>       <idle>-0       [002] d.H2.   185.610292: block_rq_complete: 8,16 RA () 98384 + 16 [0]
>>>       <idle>-0       [002] dNH2.   185.610301: block_io_done: 8,16 RA 0 () 98384 + 0 [swapper/2]
>>
>> Could this be shortened to ... the iomap calls and
>> block_bio_queue/backmerge?  It's a bit difficult to see the point you're
>> getting at with all the other noise.
>
> I will remove this log and move it to cover letter and will just extend
> the simple example I considered before in this commit message,
> to show the difference with and w/o patch.
>
>>
>> I think you're trying to say that the access pattern here is 98400 ->
>> 98408 -> 98384, which is not sequential?
>>
>
> it's (98400,8 ==> metadata block) -> (98384,16 == lblk 10 & 11) -> (98408,16 ==> lblk 12 & 13)
> ... w/o the patch
>
>>> v/s with the patch - (optimzed I/O access pattern and bio gets merged resulting in only 2 bios completion)
>>>       xfs_io-944     [005] .....    99.926187: iomap_readahead: dev 8:16 ino 0xc nr_pages 4
>>>       xfs_io-944     [005] .....    99.926208: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x4000 processed 0 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x9d/0x2c0
>>>       xfs_io-944     [005] .....    99.926211: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300a000 offset 0xa000 length 0x2000 type MAPPED flags MERGED
>>>       xfs_io-944     [005] .....    99.926222: block_bio_queue: 8,16 RA 98384 + 16 [xfs_io]
>>>       xfs_io-944     [005] .....    99.926232: block_getrq: 8,16 RA 98384 + 16 [xfs_io]
>>>       xfs_io-944     [005] .....    99.926233: block_io_start: 8,16 RA 8192 () 98384 + 16 [xfs_io]
>>>       xfs_io-944     [005] .....    99.926234: block_plug: [xfs_io]
>>>       xfs_io-944     [005] .....    99.926235: iomap_iter: dev 8:16 ino 0xc pos 0xa000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1f9/0x2c0
>>>       xfs_io-944     [005] .....    99.926261: block_bio_queue: 8,16 R 98400 + 8 [xfs_io]
>>>       xfs_io-944     [005] .....    99.926266: block_bio_backmerge: 8,16 R 98400 + 8 [xfs_io]
>>>       xfs_io-944     [005] .....    99.926271: block_unplug: [xfs_io] 1
>>>       xfs_io-944     [005] .....    99.926272: block_rq_insert: 8,16 RA 12288 () 98384 + 24 [xfs_io]
>>> kworker/5:1H-234     [005] .....    99.926314: block_rq_issue: 8,16 RA 12288 () 98384 + 24 [kworker/5:1H]
>>>       <idle>-0       [005] d.h2.    99.926905: block_rq_complete: 8,16 RA () 98384 + 24 [0]
>>>       <idle>-0       [005] dNh2.    99.926931: block_io_done: 8,16 RA 0 () 98384 + 0 [swapper/5]
>>>       xfs_io-944     [005] .....    99.926971: iomap_iter_dstmap: dev 8:16 ino 0xc bdev 8:16 addr 0x300d000 offset 0xc000 length 0x2000 type MAPPED flags MERGED
>>>       xfs_io-944     [005] .....    99.926981: block_bio_queue: 8,16 RA 98408 + 16 [xfs_io]
>>>       xfs_io-944     [005] .....    99.926989: block_getrq: 8,16 RA 98408 + 16 [xfs_io]
>>>       xfs_io-944     [005] .....    99.926989: block_io_start: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>>>       xfs_io-944     [005] .....    99.926991: block_plug: [xfs_io]
>>>       xfs_io-944     [005] .....    99.926993: iomap_iter: dev 8:16 ino 0xc pos 0xc000 length 0x2000 processed 8192 flags  (0x0) ops 0xffffffff82242160 caller iomap_readahead+0x1f9/0x2c0
>>>       xfs_io-944     [005] .....    99.927001: block_rq_issue: 8,16 RA 8192 () 98408 + 16 [xfs_io]
>>>       <idle>-0       [005] d.h2.    99.927397: block_rq_complete: 8,16 RA () 98408 + 16 [0]
>>>       <idle>-0       [005] dNh2.    99.927414: block_io_done: 8,16 RA 0 () 98408 + 0 [swapper/5]
>>> 
>>> Suggested-by: Matthew Wilcox <willy@infradead.org>
>>> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
>>> cc: Ojaswin Mujoo <ojaswin@linux.ibm.com>
>>> ---
>>>  fs/iomap/buffered-io.c | 112 +++++++++++++++++++++++++++++++----------
>>>  1 file changed, 85 insertions(+), 27 deletions(-)
>>> 
>>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>>> index 0a4269095ae2..a1d50086a3f5 100644
>>> --- a/fs/iomap/buffered-io.c
>>> +++ b/fs/iomap/buffered-io.c
>>> @@ -30,7 +30,7 @@ typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
>>>   */
>>>  struct iomap_folio_state {
>>>  	spinlock_t		state_lock;
>>> -	unsigned int		read_bytes_pending;
>>> +	size_t			read_bytes_pending;
>>>  	atomic_t		write_bytes_pending;
>>> 
>>>  	/*
>>> @@ -380,6 +380,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>>>  	loff_t orig_pos = pos;
>>>  	size_t poff, plen;
>>>  	sector_t sector;
>>> +	bool rbp_finished = false;
>>
>> What is "rbp"?  My assembly programmer brain says x64 frame pointer, but
>> that's clearly wrong here.  Maybe I'm confused...
>>
>
> rbp == read_bytes_pending ;)
>
>>>  	if (iomap->type == IOMAP_INLINE)
>>>  		return iomap_read_inline_data(iter, folio);
>>> @@ -387,21 +388,39 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>>>  	/* zero post-eof blocks as the page may be mapped */
>>>  	ifs = ifs_alloc(iter->inode, folio, iter->flags);
>>>  	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
>>> +
>>> +	if (ifs) {
>>> +		loff_t to_read = min_t(loff_t, iter->len - offset,
>>> +			folio_size(folio) - offset_in_folio(folio, orig_pos));
>>> +		size_t padjust;
>>> +
>>> +		spin_lock_irq(&ifs->state_lock);
>>> +		if (!ifs->read_bytes_pending)
>>> +			ifs->read_bytes_pending = to_read;
>>> +		padjust = pos - orig_pos;
>>> +		ifs->read_bytes_pending -= padjust;
>>> +		if (!ifs->read_bytes_pending)
>>> +			rbp_finished = true;
>>> +		spin_unlock_irq(&ifs->state_lock);
>>> +	}
>>> +
>>>  	if (plen == 0)
>>>  		goto done;
>>> 
>>>  	if (iomap_block_needs_zeroing(iter, pos)) {
>>> +		if (ifs) {
>>> +			spin_lock_irq(&ifs->state_lock);
>>> +			ifs->read_bytes_pending -= plen;
>>> +			if (!ifs->read_bytes_pending)
>>> +				rbp_finished = true;
>>> +			spin_unlock_irq(&ifs->state_lock);
>>> +		}
>>>  		folio_zero_range(folio, poff, plen);
>>>  		iomap_set_range_uptodate(folio, poff, plen);
>>>  		goto done;
>>>  	}
>>> 
>>>  	ctx->cur_folio_in_bio = true;
>>> -	if (ifs) {
>>> -		spin_lock_irq(&ifs->state_lock);
>>> -		ifs->read_bytes_pending += plen;
>>> -		spin_unlock_irq(&ifs->state_lock);
>>> -	}
>>> 
>>>  	sector = iomap_sector(iomap, pos);
>>>  	if (!ctx->bio ||
>>> @@ -435,6 +454,14 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>>>  	}
>>> 
>>>  done:
>>> +	/*
>>> +	 * If there is no bio prepared and if rbp is finished and
>>> +	 * this was the last offset within this folio then mark
>>> +	 * cur_folio_in_bio to false.
>>> +	 */
>>> +	if (!ctx->bio && rbp_finished &&
>>> +			offset_in_folio(folio, pos + plen) == 0)
>>> +		ctx->cur_folio_in_bio = false;
>>
>> ...yes, I think I am confused.  When would ctx->bio be NULL but
>> cur_folio_in_bio is true?
>
> Previously we had the bio submitted and so we make it null, but we still
> have ctx->cur_folio & ctx->cur_folio_in_bio to true, since we haven't
> completely processed the folio.
>
>>
>> I /think/ what you're doing here is using read_bytes_pending to figure
>> out if you've processed the folio up to the end of the mapping?  But
>> then you submit the bio unconditionally below for each readpage_iter
>> call?
>>
>
> yes, that's right.
>
>> Why not add an IOMAP_BOUNDARY flag that means "I will have to do some IO
>> if you call ->iomap_begin again"?  Then if we get to this point in
>> readpage_iter with a ctx->bio, we can submit the bio, clear
>> cur_folio_in_bio, and return?  And then you don't need this machinery?
>
> TBH, I initially didn't think the approach taken in the patch would
> require such careful handling of r_b_p. It was because of all of this
> corner cases when we don't need to read the update blocks and/or in case
> of an error we need to ensure we reduce r_b_p carefully so that we could
> unlock the folio and when extent spans beyond i_size.
>
> So it's all about how do we know if we could unlock the folio and that it's
> corresponding blocks/mapping has been all processed or submitted for
> I/O. 
>
> Assume we have a folio which spans over multiple extents. In such a
> case, 
> -> we process a bio for 1st extent, 
> -> then we go back to iomap_iter() to get new extent mapping, 
> -> We now increment the r_b_p with this new plen to be processed. 
> -> We then submit the previous bio, since this new mapping couldn't be
> merged due to discontinuous extents. 
> So by first incrementing the r_b_p before doing submit_bio(), we don't
> unlock the folio at bio completion.
>
> Maybe, it would be helpful if we have an easy mechanism to keep some state
> from the time of submit_bio() till the bio completion to know that the
> corresponding folio is still being processed and it shouldn't be
> unlocked.
>  -> This currently is what we are doing by making r_b_p to the value of
>  folio_size() and then carefully reducing r_b_p for all the cases I
>  mentioned above.
>
> Let me think if adding a IOMAP_BH_BOUNDARY flag could be helpful or not.
> Say if we have a pagesize of 64k that means all first 16 blocks belongs
> to same page. So even with IOMAP_BH_BOUNDARY flag the problem that still
> remains is that, even if we submit the bio at block 11 (bh_boundary
> block), how will the bio completion side know that the folio is not
> completely processed and so we shouldn't unlock the folio?

Maybe one way could be to add another state flag to ifs for the
BH_BOUNDARY block and read that at bio completion.
We could then also let the completion side know whether it should
unlock the folio or whether the folio still needs processing on the
submission side.

I guess that might be an easier approach than this. Thoughts?
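
A rough sketch of that idea (entirely illustrative; the field name and
the completion-side check below are made up):

	struct iomap_folio_state {
		spinlock_t	state_lock;
		size_t		read_bytes_pending;
		atomic_t	write_bytes_pending;
		/* hypothetical: true while blocks beyond a bh_boundary
		 * block of this folio are still being processed */
		bool		read_boundary_pending;
		unsigned long	state[];
	};

	/* at read bio completion (sketch) */
	if (!ifs->read_bytes_pending && !ifs->read_boundary_pending)
		folio_unlock(folio);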

-ritesh
Matthew Wilcox April 26, 2024, 5:37 p.m. UTC | #4
On Fri, Apr 26, 2024 at 10:55:23PM +0530, Ritesh Harjani wrote:
> <..>
> 
> Maybe one way could be to add another state flag to ifs for the
> BH_BOUNDARY block and read that at bio completion.
> We can then also let the completion side know whether it should unlock the
> folio or whether it still needs processing on the submission side.

The approach I suggested was to initialise read_bytes_pending to
folio_size() at the start.  Then subtract off blocksize for each
uptodate block, whether you find it already uptodate, or as the
completion handler runs.

Is there a reason that doesn't work?
Ritesh Harjani (IBM) April 26, 2024, 5:55 p.m. UTC | #5
Matthew Wilcox <willy@infradead.org> writes:

> On Fri, Apr 26, 2024 at 10:55:23PM +0530, Ritesh Harjani wrote:
>> <..>
>> 
>> Maybe one way could be to add another state flag to ifs for the
>> BH_BOUNDARY block and read that at bio completion.
>> We can then also let the completion side know whether it should unlock the
>> folio or whether it still needs processing on the submission side.
>
> The approach I suggested was to initialise read_bytes_pending to
> folio_size() at the start.  Then subtract off blocksize for each
> uptodate block, whether you find it already uptodate, or as the
> completion handler runs.
>
> Is there a reason that doesn't work?

That is what this patch series does, right? The current patch does work
as far as my testing goes.

For example, this is what initializes r_b_p for the first time, when
ifs->r_b_p is 0.

+		loff_t to_read = min_t(loff_t, iter->len - offset,
+			folio_size(folio) - offset_in_folio(folio, orig_pos));
<..>
+		if (!ifs->read_bytes_pending)
+			ifs->read_bytes_pending = to_read;


Then this is where we subtract r_b_p for blocks which are uptodate.
+		padjust = pos - orig_pos;
+		ifs->read_bytes_pending -= padjust;


This is where we adjust r_b_p when we directly zero the folio.
 	if (iomap_block_needs_zeroing(iter, pos)) {
+		if (ifs) {
+			spin_lock_irq(&ifs->state_lock);
+			ifs->read_bytes_pending -= plen;
+			if (!ifs->read_bytes_pending)
+				rbp_finished = true;
+			spin_unlock_irq(&ifs->state_lock);
+		}

But as you see, this requires surgery throughout the read paths. What if
we add a state flag to ifs only for BH_BOUNDARY? Maybe that could
result in a more simplified approach,
because all we require is to know whether the folio should be unlocked
or not at the time of completion.

Do you think we should try that, or do you think the current approach
looks ok?

-ritesh
Matthew Wilcox April 26, 2024, 6:13 p.m. UTC | #6
On Fri, Apr 26, 2024 at 11:25:25PM +0530, Ritesh Harjani wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> > The approach I suggested was to initialise read_bytes_pending to
> > folio_size() at the start.  Then subtract off blocksize for each
> > uptodate block, whether you find it already uptodate, or as the
> > completion handler runs.
> >
> > Is there a reason that doesn't work?
> 
> That is what this patch series does, right? The current patch does work
> as far as my testing goes.
> 
> <..>
> 
> But as you see, this requires surgery throughout the read paths. What if
> we add a state flag to ifs only for BH_BOUNDARY? Maybe that could
> result in a more simplified approach,
> because all we require is to know whether the folio should be unlocked
> or not at the time of completion.
> 
> Do you think we should try that, or do you think the current approach
> looks ok?

You've really made life hard for yourself.  I had something more like
this in mind.  I may have missed a few places that need to be changed,
but this should update read_bytes_pending everywhere that we set bits
in the uptodate bitmap, so it should be right?

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 41c8f0c68ef5..f87ca8ee4d19 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -79,6 +79,7 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
 	if (ifs) {
 		spin_lock_irqsave(&ifs->state_lock, flags);
 		uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+		ifs->read_bytes_pending -= len;
 		spin_unlock_irqrestore(&ifs->state_lock, flags);
 	}
 
@@ -208,6 +209,8 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode,
 	spin_lock_init(&ifs->state_lock);
 	if (folio_test_uptodate(folio))
 		bitmap_set(ifs->state, 0, nr_blocks);
+	else
+		ifs->read_bytes_pending = folio_size(folio);
 	if (folio_test_dirty(folio))
 		bitmap_set(ifs->state, nr_blocks, nr_blocks);
 	folio_attach_private(folio, ifs);
@@ -396,12 +399,6 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
 	}
 
 	ctx->cur_folio_in_bio = true;
-	if (ifs) {
-		spin_lock_irq(&ifs->state_lock);
-		ifs->read_bytes_pending += plen;
-		spin_unlock_irq(&ifs->state_lock);
-	}
-
 	sector = iomap_sector(iomap, pos);
 	if (!ctx->bio ||
 	    bio_end_sector(ctx->bio) != sector ||
Ritesh Harjani (IBM) April 26, 2024, 6:57 p.m. UTC | #7
Matthew Wilcox <willy@infradead.org> writes:

> On Fri, Apr 26, 2024 at 11:25:25PM +0530, Ritesh Harjani wrote:
>> <..>
>
> You've really made life hard for yourself.  I had something more like
> this in mind.  I may have missed a few places that need to be changed,
> but this should update read_bytes_pending everywhere that we set bits
> in the uptodate bitmap, so it should be right?

Please correct me if I am wrong.

>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 41c8f0c68ef5..f87ca8ee4d19 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -79,6 +79,7 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
>  	if (ifs) {
>  		spin_lock_irqsave(&ifs->state_lock, flags);
>  		uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
> +		ifs->read_bytes_pending -= len;
>  		spin_unlock_irqrestore(&ifs->state_lock, flags);
>  	}

iomap_set_range_uptodate() gets called from ->write_begin() and
->write_end() too. So what we are saying is that we are updating
read_bytes_pending even though we are not in a
->read_folio() or ->readahead() call?

>  
> @@ -208,6 +209,8 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode,
>  	spin_lock_init(&ifs->state_lock);
>  	if (folio_test_uptodate(folio))
>  		bitmap_set(ifs->state, 0, nr_blocks);
> +	else
> +		ifs->read_bytes_pending = folio_size(folio);

We might not get here during ->read_folio -> ifs_alloc(), since we
might have a cached ifs which was allocated during a write to this folio.

But unless you are saying that during writes we would have set
ifs->r_b_p to folio_size(), and when the read call happens we use
the same value from the cached ifs...
Ok, I see. I was mostly focusing on updating the ifs->r_b_p value only
when read bytes are actually pending during ->read_folio() or
->readahead(), and not on updating r_b_p during writes.

...One small problem which I see with this approach is: we might have
some non-zero value in ifs->r_b_p when ifs_free() gets called, and it
might give a warning about the non-zero ifs->r_b_p, because we updated
ifs->r_b_p to a non-zero value during writes but the reads
never happened. Then during a call to ->release_folio, it will complain
about a non-zero ifs->r_b_p.


>  	if (folio_test_dirty(folio))
>  		bitmap_set(ifs->state, nr_blocks, nr_blocks);
>  	folio_attach_private(folio, ifs);
> @@ -396,12 +399,6 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>  	}
>  
>  	ctx->cur_folio_in_bio = true;
> -	if (ifs) {
> -		spin_lock_irq(&ifs->state_lock);
> -		ifs->read_bytes_pending += plen;
> -		spin_unlock_irq(&ifs->state_lock);
> -	}
> -
>  	sector = iomap_sector(iomap, pos);
>  	if (!ctx->bio ||
>  	    bio_end_sector(ctx->bio) != sector ||

-ritesh
Matthew Wilcox April 26, 2024, 7:19 p.m. UTC | #8
On Sat, Apr 27, 2024 at 12:27:52AM +0530, Ritesh Harjani wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> > @@ -79,6 +79,7 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
> >  	if (ifs) {
> >  		spin_lock_irqsave(&ifs->state_lock, flags);
> >  		uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
> > +		ifs->read_bytes_pending -= len;
> >  		spin_unlock_irqrestore(&ifs->state_lock, flags);
> >  	}
> 
> iomap_set_range_uptodate() gets called from ->write_begin() and
> ->write_end() too. So what we are saying is that we are updating
> read_bytes_pending even though we are not in a
> ->read_folio() or ->readahead() call?

Exactly.

> >  
> > @@ -208,6 +209,8 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode,
> >  	spin_lock_init(&ifs->state_lock);
> >  	if (folio_test_uptodate(folio))
> >  		bitmap_set(ifs->state, 0, nr_blocks);
> > +	else
> > +		ifs->read_bytes_pending = folio_size(folio);
> 
> We might not get here during ->read_folio -> ifs_alloc(), since we
> might have a cached ifs which was allocated during a write to this folio.
> 
> But unless you are saying that during writes we would have set
> ifs->r_b_p to folio_size(), and when the read call happens we use
> the same value from the cached ifs...
> Ok, I see. I was mostly focusing on updating the ifs->r_b_p value only
> when read bytes are actually pending during ->read_folio() or
> ->readahead(), and not on updating r_b_p during writes.

I see why you might want to think that way ... but this way is much less
complex, don't you think?  ;-)

> ...One small problem which I see with this approach is: we might have
> some non-zero value in ifs->r_b_p when ifs_free() gets called, and it
> might give a warning about the non-zero ifs->r_b_p, because we updated
> ifs->r_b_p to a non-zero value during writes but the reads
> never happened. Then during a call to ->release_folio, it will complain
> about a non-zero ifs->r_b_p.

Yes, we'll have to remove that assertion.  I don't think that's a
problem, do you?
Christoph Hellwig April 27, 2024, 4:47 a.m. UTC | #9
On Thu, Apr 25, 2024 at 06:58:51PM +0530, Ritesh Harjani (IBM) wrote:
> Currently the bios for reads within iomap are only submitted at
> 2 places -
> 1. If we cannot merge the new req. with previous bio, only then we
>    submit the previous bio.
> 2. Submit the bio at the end of the entire read processing.
> 
> This means for filesystems with indirect block mapping, we call into
> ->iomap_begin() again w/o submitting the previous bios. That causes
> unoptimized data access patterns for blocks which are of BH_Boundary type.

The same is true for extent mappings.  And it's not ideal there either,
although it only increases the bio submission latency.
Christoph Hellwig April 27, 2024, 4:54 a.m. UTC | #10
On Fri, Apr 26, 2024 at 08:19:47PM +0100, Matthew Wilcox wrote:
> > ...One small problem which I see with this approach is - we might have
> > some non-zero value in ifs->r_b_p when ifs_free() gets called and it
> > might give a warning of non-zero ifs->r_b_p, because we updated
> > ifs->r_b_p during writes to a non-zero value, but the reads
> > never happend. Then during a call to ->release_folio, it will complain
> > of a non-zero ifs->r_b_p.
> 
> Yes, we'll have to remove that assertion.  I don't think that's a
> problem, do you?

Or refine it, as the parts not read should not be uptodate either?
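
Something like this, perhaps (an untested sketch; this ifs_free is
simplified and not the exact helper in the tree):

	static void ifs_free(struct folio *folio)
	{
		struct iomap_folio_state *ifs = folio_detach_private(folio);

		/*
		 * A leftover read_bytes_pending is fine if the read never
		 * happened; only complain if the folio claims to be fully
		 * uptodate anyway.
		 */
		WARN_ON_ONCE(ifs->read_bytes_pending &&
			     folio_test_uptodate(folio));
		kfree(ifs);
	}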

Either way, I had another idea to simplify things a bit, but this might
end up being even simpler, so I'll stop hacking on my version
for now.
Ritesh Harjani (IBM) April 27, 2024, 6:03 a.m. UTC | #11
Matthew Wilcox <willy@infradead.org> writes:

> On Sat, Apr 27, 2024 at 12:27:52AM +0530, Ritesh Harjani wrote:
>> <..>
>
>> ...One small problem which I see with this approach is: we might have
>> some non-zero value in ifs->r_b_p when ifs_free() gets called, and it
>> might give a warning about the non-zero ifs->r_b_p, because we updated
>> ifs->r_b_p to a non-zero value during writes but the reads
>> never happened. Then during a call to ->release_folio, it will complain
>> about a non-zero ifs->r_b_p.
>
> Yes, we'll have to remove that assertion.  I don't think that's a
> problem, do you?

Sure, I will give it a spin.

-ritesh
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0a4269095ae2..a1d50086a3f5 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -30,7 +30,7 @@  typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
  */
 struct iomap_folio_state {
 	spinlock_t		state_lock;
-	unsigned int		read_bytes_pending;
+	size_t			read_bytes_pending;
 	atomic_t		write_bytes_pending;

 	/*
@@ -380,6 +380,7 @@  static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
 	loff_t orig_pos = pos;
 	size_t poff, plen;
 	sector_t sector;
+	bool rbp_finished = false;

 	if (iomap->type == IOMAP_INLINE)
 		return iomap_read_inline_data(iter, folio);
@@ -387,21 +388,39 @@  static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
 	/* zero post-eof blocks as the page may be mapped */
 	ifs = ifs_alloc(iter->inode, folio, iter->flags);
 	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
+
+	if (ifs) {
+		loff_t to_read = min_t(loff_t, iter->len - offset,
+			folio_size(folio) - offset_in_folio(folio, orig_pos));
+		size_t padjust;
+
+		spin_lock_irq(&ifs->state_lock);
+		if (!ifs->read_bytes_pending)
+			ifs->read_bytes_pending = to_read;
+		padjust = pos - orig_pos;
+		ifs->read_bytes_pending -= padjust;
+		if (!ifs->read_bytes_pending)
+			rbp_finished = true;
+		spin_unlock_irq(&ifs->state_lock);
+	}
+
 	if (plen == 0)
 		goto done;

 	if (iomap_block_needs_zeroing(iter, pos)) {
+		if (ifs) {
+			spin_lock_irq(&ifs->state_lock);
+			ifs->read_bytes_pending -= plen;
+			if (!ifs->read_bytes_pending)
+				rbp_finished = true;
+			spin_unlock_irq(&ifs->state_lock);
+		}
 		folio_zero_range(folio, poff, plen);
 		iomap_set_range_uptodate(folio, poff, plen);
 		goto done;
 	}

 	ctx->cur_folio_in_bio = true;
-	if (ifs) {
-		spin_lock_irq(&ifs->state_lock);
-		ifs->read_bytes_pending += plen;
-		spin_unlock_irq(&ifs->state_lock);
-	}

 	sector = iomap_sector(iomap, pos);
 	if (!ctx->bio ||
@@ -435,6 +454,14 @@  static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
 	}

 done:
+	/*
+	 * If there is no bio prepared and if rbp is finished and
+	 * this was the last offset within this folio then mark
+	 * cur_folio_in_bio to false.
+	 */
+	if (!ctx->bio && rbp_finished &&
+			offset_in_folio(folio, pos + plen) == 0)
+		ctx->cur_folio_in_bio = false;
 	/*
 	 * Move the caller beyond our range so that it keeps making progress.
 	 * For that, we have to include any leading non-uptodate ranges, but
@@ -459,9 +486,43 @@  static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
 			return ret;
 	}

+	if (ctx->bio) {
+		submit_bio(ctx->bio);
+		WARN_ON_ONCE(!ctx->cur_folio_in_bio);
+		ctx->bio = NULL;
+	}
+	if (offset_in_folio(folio, iter->pos + done) == 0 &&
+			!ctx->cur_folio_in_bio) {
+		folio_unlock(ctx->cur_folio);
+	}
+
 	return done;
 }

+static void iomap_handle_read_error(struct iomap_readpage_ctx *ctx,
+		struct iomap_iter *iter)
+{
+	struct folio *folio = ctx->cur_folio;
+	struct iomap_folio_state *ifs;
+	unsigned long flags;
+	bool rbp_finished = false;
+	size_t rbp_adjust = folio_size(folio) - offset_in_folio(folio,
+				iter->pos);
+	ifs = folio->private;
+	if (!ifs || !ifs->read_bytes_pending)
+		goto unlock;
+
+	spin_lock_irqsave(&ifs->state_lock, flags);
+	ifs->read_bytes_pending -= rbp_adjust;
+	if (!ifs->read_bytes_pending)
+		rbp_finished = true;
+	spin_unlock_irqrestore(&ifs->state_lock, flags);
+
+unlock:
+	if (rbp_finished || !ctx->cur_folio_in_bio)
+		folio_unlock(folio);
+}
+
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
 {
 	struct iomap_iter iter = {
@@ -479,15 +540,9 @@  int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
 	while ((ret = iomap_iter(&iter, ops)) > 0)
 		iter.processed = iomap_read_folio_iter(&iter, &ctx);

-	if (ret < 0)
+	if (ret < 0) {
 		folio_set_error(folio);
-
-	if (ctx.bio) {
-		submit_bio(ctx.bio);
-		WARN_ON_ONCE(!ctx.cur_folio_in_bio);
-	} else {
-		WARN_ON_ONCE(ctx.cur_folio_in_bio);
-		folio_unlock(folio);
+		iomap_handle_read_error(&ctx, &iter);
 	}

 	/*
@@ -506,12 +561,6 @@  static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
 	loff_t done, ret;

 	for (done = 0; done < length; done += ret) {
-		if (ctx->cur_folio &&
-		    offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
-			if (!ctx->cur_folio_in_bio)
-				folio_unlock(ctx->cur_folio);
-			ctx->cur_folio = NULL;
-		}
 		if (!ctx->cur_folio) {
 			ctx->cur_folio = readahead_folio(ctx->rac);
 			ctx->cur_folio_in_bio = false;
@@ -519,6 +568,17 @@  static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
 		ret = iomap_readpage_iter(iter, ctx, done);
 		if (ret <= 0)
 			return ret;
+		if (ctx->cur_folio && offset_in_folio(ctx->cur_folio,
+					iter->pos + done + ret) == 0) {
+			if (!ctx->cur_folio_in_bio)
+				folio_unlock(ctx->cur_folio);
+			ctx->cur_folio = NULL;
+		}
+	}
+
+	if (ctx->bio) {
+		submit_bio(ctx->bio);
+		ctx->bio = NULL;
 	}

 	return done;
@@ -549,18 +609,16 @@  void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
 	struct iomap_readpage_ctx ctx = {
 		.rac	= rac,
 	};
+	int ret = 0;

 	trace_iomap_readahead(rac->mapping->host, readahead_count(rac));

-	while (iomap_iter(&iter, ops) > 0)
+	while ((ret = iomap_iter(&iter, ops)) > 0)
 		iter.processed = iomap_readahead_iter(&iter, &ctx);

-	if (ctx.bio)
-		submit_bio(ctx.bio);
-	if (ctx.cur_folio) {
-		if (!ctx.cur_folio_in_bio)
-			folio_unlock(ctx.cur_folio);
-	}
+	if (ret < 0 && ctx.cur_folio)
+		iomap_handle_read_error(&ctx, &iter);
+
 }
 EXPORT_SYMBOL_GPL(iomap_readahead);