diff mbox series

[17/17] fs: add CONFIG_BUFFER_HEAD

Message ID 20230424054926.26927-18-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series [01/17] fs: unexport buffer_check_dirty_writeback | expand

Commit Message

Christoph Hellwig April 24, 2023, 5:49 a.m. UTC
Add a new config option that controls building the buffer_head code, and
select it from all file systems and stacking drivers that need it.

For the block device nodes an alternative iomap based buffered I/O path
is provided when buffer_head support is not enabled, and iomap needs a
little tweak to be able to compile out the buffer_head based code path.

Otherwise this is just Kconfig and ifdef changes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/fops.c                 | 71 +++++++++++++++++++++++++++++++-----
 drivers/md/Kconfig           |  1 +
 fs/Kconfig                   |  4 ++
 fs/Makefile                  |  2 +-
 fs/adfs/Kconfig              |  1 +
 fs/affs/Kconfig              |  1 +
 fs/befs/Kconfig              |  1 +
 fs/bfs/Kconfig               |  1 +
 fs/efs/Kconfig               |  1 +
 fs/exfat/Kconfig             |  1 +
 fs/ext2/Kconfig              |  1 +
 fs/ext4/Kconfig              |  1 +
 fs/f2fs/Kconfig              |  1 +
 fs/fat/Kconfig               |  1 +
 fs/freevxfs/Kconfig          |  1 +
 fs/gfs2/Kconfig              |  1 +
 fs/hfs/Kconfig               |  1 +
 fs/hfsplus/Kconfig           |  1 +
 fs/hpfs/Kconfig              |  1 +
 fs/iomap/buffered-io.c       | 12 ++++--
 fs/isofs/Kconfig             |  1 +
 fs/jfs/Kconfig               |  1 +
 fs/minix/Kconfig             |  1 +
 fs/nilfs2/Kconfig            |  1 +
 fs/ntfs/Kconfig              |  1 +
 fs/ntfs3/Kconfig             |  1 +
 fs/ocfs2/Kconfig             |  1 +
 fs/omfs/Kconfig              |  1 +
 fs/qnx4/Kconfig              |  1 +
 fs/qnx6/Kconfig              |  1 +
 fs/reiserfs/Kconfig          |  1 +
 fs/sysv/Kconfig              |  1 +
 fs/udf/Kconfig               |  1 +
 fs/ufs/Kconfig               |  1 +
 include/linux/buffer_head.h  | 32 ++++++++--------
 include/trace/events/block.h |  2 +
 mm/migrate.c                 |  4 +-
 37 files changed, 125 insertions(+), 32 deletions(-)

Comments

Luis Chamberlain April 29, 2023, 12:11 a.m. UTC | #1
On Mon, Apr 24, 2023 at 07:49:26AM +0200, Christoph Hellwig wrote:
> +const struct address_space_operations def_blk_aops = {
> +	.dirty_folio	= filemap_dirty_folio,
> +	.release_folio		= iomap_release_folio,
> +	.invalidate_folio	= iomap_invalidate_folio,
> +	.read_folio		= blkdev_read_folio,
> +	.readahead		= blkdev_readahead,
> +	.writepages		= blkdev_writepages,
> +	.is_partially_uptodate  = iomap_is_partially_uptodate,
> +	.error_remove_page	= generic_error_remove_page,
> +	.migrate_folio		= filemap_migrate_folio,
> +};
> +#endif /* CONFIG_BUFFER_HEAD */

We've tested this with bs > ps (LBS) devices and it would seem it crashes,
as Pankaj notes perhaps due to lack of higher order folio support yet
on this path, for the block cache. The same crash happens with NVMe
(using out-of-tree nvme_core.debug_large_lbas boot parameter to enable NVMe
LBS) or brd with LBS. To enable NVMe LBS or brd with LBS you need
out-of-tree patches, of course, so I've stashed these into
a branch, large-block-20230426 [0], to help folks who may want
to experiment further.

[0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux-next.git/log/?h=large-block-20230426

[   11.245248] BUG: kernel NULL pointer dereference, address: 0000000000000000
[   11.254581] #PF: supervisor read access in kernel mode
[   11.257387] #PF: error_code(0x0000) - not-present page
[   11.260921] PGD 0 P4D 0
[   11.262600] Oops: 0000 [#1] PREEMPT SMP PTI
[   11.264993] CPU: 7 PID: 198 Comm: (udev-worker) Not tainted 6.3.0-large-block-20230426 #2
[   11.269385] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
[   11.275054] RIP: 0010:iomap_page_create.isra.0+0xc/0xd0
[   11.277924] Code: 41 5e 41 5f c3 cc cc cc cc 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 54 55 48 89 f5 53 <48> 8b 06 48 c1 e8 0d 89 c6 83 e6 01 0f 84 a1 00 00 00 4c 8b 65 28
[   11.287293] RSP: 0018:ffffb0f0805ef9d8 EFLAGS: 00010293
[   11.289964] RAX: ffff9de3c1fa8388 RBX: ffffb0f0805efa78 RCX: 000000037ffe0000
[   11.293212] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000000000d
[   11.296485] RBP: 0000000000000000 R08: 0000000000021000 R09: ffffffff9c733b20
[   11.299724] R10: 0000000000000001 R11: 000000000000c000 R12: 0000000000000000
[   11.302974] R13: ffffffff9be96260 R14: ffffb0f0805efa58 R15: 0000000000000000
[   11.306206] FS:  00007f03ea8368c0(0000) GS:ffff9de43bdc0000(0000) knlGS:0000000000000000
[   11.309949] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   11.312464] CR2: 0000000000000000 CR3: 0000000117ec6006 CR4: 0000000000770ee0
[   11.315442] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   11.318310] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   11.321010] PKRU: 55555554
[   11.322212] Call Trace:
[   11.323224]  <TASK>
[   11.324146]  iomap_readpage_iter+0x96/0x300
[   11.325694]  iomap_readahead+0x174/0x2d0
[   11.327129]  read_pages+0x69/0x1f0
[   11.328389]  ? folio_add_lru+0x7e/0xe0
[   11.329751]  page_cache_ra_unbounded+0x187/0x1d0
[   11.331301]  force_page_cache_ra+0x94/0xb0
[   11.332681]  filemap_get_pages+0x10e/0x650
[   11.334073]  ? _raw_spin_lock+0x13/0x40
[   11.335287]  filemap_read+0xbf/0x340
[   11.336430]  ? aa_file_perm+0x117/0x4b0
[   11.337646]  ? generic_fillattr+0x45/0xf0
[   11.338887]  ? _copy_to_user+0x22/0x30
[   11.340026]  ? cp_new_stat+0x150/0x180
[   11.341166]  blkdev_read_iter+0x5e/0x140
[   11.342357]  vfs_read+0x1f0/0x2c0
[   11.343354]  ksys_read+0x63/0xe0
[   11.344331]  do_syscall_64+0x37/0x90
[   11.345411]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
[   11.346760] RIP: 0033:0x7f03eaf3903d

(gdb) l *(iomap_readpage_iter+0x96)
0xffffffff814021b6 is in iomap_readpage_iter (fs/iomap/buffered-io.c:280).
275             if (iomap->type == IOMAP_INLINE)
276                     return iomap_read_inline_data(iter, folio);
277
278             /* zero post-eof blocks as the page may be mapped */
279             iop = iomap_page_create(iter->inode, folio, iter->flags);
280             iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
281             if (plen == 0)
282                     goto done;
283
284             if (iomap_block_needs_zeroing(iter, pos)) {
(gdb) l *(iomap_page_create+0xc)
0xffffffff81400cdc is in iomap_page_create (./arch/x86/include/asm/bitops.h:207).
202     }
203
204     static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
205     {
206             return ((1UL << (nr & (BITS_PER_LONG-1))) &
207                     (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
208     }
209
210     static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
211     {

To reproduce one would want a system with only say XFS as the root
image. I've enabled this on kdevops through "pure-iomap" option:

https://github.com/linux-kdevops/kdevops/blob/master/docs/lbs.md

  Luis
Matthew Wilcox April 29, 2023, 1:20 a.m. UTC | #2
On Fri, Apr 28, 2023 at 05:11:57PM -0700, Luis Chamberlain wrote:
> [   11.245248] BUG: kernel NULL pointer dereference, address: 0000000000000000
> [   11.254581] #PF: supervisor read access in kernel mode
> [   11.257387] #PF: error_code(0x0000) - not-present page
> [   11.260921] PGD 0 P4D 0
> [   11.262600] Oops: 0000 [#1] PREEMPT SMP PTI
> [   11.264993] CPU: 7 PID: 198 Comm: (udev-worker) Not tainted 6.3.0-large-block-20230426 #2
> [   11.269385] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
> [   11.275054] RIP: 0010:iomap_page_create.isra.0+0xc/0xd0
> [   11.277924] Code: 41 5e 41 5f c3 cc cc cc cc 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 54 55 48 89 f5 53 <48> 8b 06 48 c1 e8 0d 89 c6 83 e6 01 0f 84 a1 00 00 00 4c 8b 65 28
> [   11.287293] RSP: 0018:ffffb0f0805ef9d8 EFLAGS: 00010293
> [   11.289964] RAX: ffff9de3c1fa8388 RBX: ffffb0f0805efa78 RCX: 000000037ffe0000
> [   11.293212] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000000000d
> [   11.296485] RBP: 0000000000000000 R08: 0000000000021000 R09: ffffffff9c733b20
> [   11.299724] R10: 0000000000000001 R11: 000000000000c000 R12: 0000000000000000
> [   11.302974] R13: ffffffff9be96260 R14: ffffb0f0805efa58 R15: 0000000000000000

RSI is argument 2, which is folio.

Code starting with the faulting instruction
===========================================
   0:	48 8b 06             	mov    (%rsi),%rax
   3:	48 c1 e8 0d          	shr    $0xd,%rax

Looks to me like a NULL folio was passed into iomap_page_create().

> [   11.306206] FS:  00007f03ea8368c0(0000) GS:ffff9de43bdc0000(0000) knlGS:0000000000000000
> [   11.309949] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   11.312464] CR2: 0000000000000000 CR3: 0000000117ec6006 CR4: 0000000000770ee0
> [   11.315442] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   11.318310] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [   11.321010] PKRU: 55555554
> [   11.322212] Call Trace:
> [   11.323224]  <TASK>
> [   11.324146]  iomap_readpage_iter+0x96/0x300
> [   11.325694]  iomap_readahead+0x174/0x2d0
> [   11.327129]  read_pages+0x69/0x1f0
> [   11.329751]  page_cache_ra_unbounded+0x187/0x1d0

... that shouldn't be possible.  read_pages() allocates pages, puts them
in the page cache and tells the filesystem to fill them in.

In your patches, did you call mapping_set_large_folios() anywhere?
Luis Chamberlain May 1, 2023, 3:14 a.m. UTC | #3
On Sat, Apr 29, 2023 at 02:20:17AM +0100, Matthew Wilcox wrote:
> On Fri, Apr 28, 2023 at 05:11:57PM -0700, Luis Chamberlain wrote:
> > [   11.245248] BUG: kernel NULL pointer dereference, address: 0000000000000000
> > [   11.254581] #PF: supervisor read access in kernel mode
> > [   11.257387] #PF: error_code(0x0000) - not-present page
> > [   11.260921] PGD 0 P4D 0
> > [   11.262600] Oops: 0000 [#1] PREEMPT SMP PTI
> > [   11.264993] CPU: 7 PID: 198 Comm: (udev-worker) Not tainted 6.3.0-large-block-20230426 #2
> > [   11.269385] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
> > [   11.275054] RIP: 0010:iomap_page_create.isra.0+0xc/0xd0
> > [   11.277924] Code: 41 5e 41 5f c3 cc cc cc cc 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 54 55 48 89 f5 53 <48> 8b 06 48 c1 e8 0d 89 c6 83 e6 01 0f 84 a1 00 00 00 4c 8b 65 28
> > [   11.287293] RSP: 0018:ffffb0f0805ef9d8 EFLAGS: 00010293
> > [   11.289964] RAX: ffff9de3c1fa8388 RBX: ffffb0f0805efa78 RCX: 000000037ffe0000
> > [   11.293212] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000000000d
> > [   11.296485] RBP: 0000000000000000 R08: 0000000000021000 R09: ffffffff9c733b20
> > [   11.299724] R10: 0000000000000001 R11: 000000000000c000 R12: 0000000000000000
> > [   11.302974] R13: ffffffff9be96260 R14: ffffb0f0805efa58 R15: 0000000000000000
> 
> RSI is argument 2, which is folio.
> 
> Code starting with the faulting instruction
> ===========================================
>    0:	48 8b 06             	mov    (%rsi),%rax
>    3:	48 c1 e8 0d          	shr    $0xd,%rax
> 
> Looks to me like a NULL folio was passed into iomap_page_create().
> 
> > [   11.306206] FS:  00007f03ea8368c0(0000) GS:ffff9de43bdc0000(0000) knlGS:0000000000000000
> > [   11.309949] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [   11.312464] CR2: 0000000000000000 CR3: 0000000117ec6006 CR4: 0000000000770ee0
> > [   11.315442] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > [   11.318310] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > [   11.321010] PKRU: 55555554
> > [   11.322212] Call Trace:
> > [   11.323224]  <TASK>
> > [   11.324146]  iomap_readpage_iter+0x96/0x300
> > [   11.325694]  iomap_readahead+0x174/0x2d0
> > [   11.327129]  read_pages+0x69/0x1f0
> > [   11.329751]  page_cache_ra_unbounded+0x187/0x1d0
> 
> ... that shouldn't be possible.  read_pages() allocates pages, puts them
> in the page cache and tells the filesystem to fill them in.
> 
> In your patches, did you call mapping_set_large_folios() anywhere?

No but the only place to add that would be in the block cache. Adding
that alone to the block cache doesn't fix the issue. The below patch
however does get us by.

From my readings it doesn't seem like readahead_folio() should always
return non-NULL, and also I couldn't easily verify the math is right.
The max cap I see is for the backing device io_size, but there are
some other heuristics which will take me some time to try to grok
to understand if they are correct.

diff --git a/block/bdev.c b/block/bdev.c
index 21c63bfef323..afa0c5ebd364 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -411,6 +411,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	inode->i_rdev = 0;
 	inode->i_data.a_ops = &def_blk_aops;
 	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
+	mapping_set_large_folios(inode->i_mapping);
 
 	bdev = I_BDEV(inode);
 	mutex_init(&bdev->bd_fsfreeze_mutex);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8115b0d9a85a..d3c9e16a7066 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -384,7 +384,10 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
 		}
 		if (!ctx->cur_folio) {
 			ctx->cur_folio = readahead_folio(ctx->rac);
-			ctx->cur_folio_in_bio = false;
+			if (ctx->cur_folio)
+				ctx->cur_folio_in_bio = false;
+			else
+				continue;
 		}
 		ret = iomap_readpage_iter(iter, ctx, done);
 		if (ret <= 0)

We still see:

Apr 30 19:54:21 iomap kernel: ------------[ cut here ]------------
Apr 30 19:54:21 iomap kernel: WARNING: CPU: 4 PID: 197 at block/fops.c:389 blkdev_iomap_begin+0x80/0x90
Apr 30 19:54:21 iomap kernel: Modules linked in: psmouse virtio_blk failover nvme nvme_core crc32c_intel t10_pi virtio_pci
virtio_pci_legacy_dev virtio_pci_modern_dev virtio crc64_rocksoft >
Apr 30 19:54:21 iomap kernel: CPU: 4 PID: 197 Comm: (udev-worker) Not tainted 6.3.0-large-block-20230426-dirty #7
Apr 30 19:54:21 iomap kernel: Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
Apr 30 19:54:21 iomap kernel: RIP: 0010:blkdev_iomap_begin+0x80/0x90
Apr 30 19:54:21 iomap kernel: Code: c0 48 89 53 10 5b 5d 41 5c c3 cc cc
cc cc 89 c6 83 e8 01 48 8d 6c 2e ff 48 f7 de 48 29 c5 48 21 ee 48 89 73
08 48 39 d6 7c c6 <0f> 0b b8 fb ff ff ff 5b 5>
Apr 30 19:54:21 iomap kernel: RSP: 0018:ffffa437c0723a08 EFLAGS: 00010246
Apr 30 19:54:21 iomap kernel: RAX: 0000000000007fff RBX: ffffa437c0723aa0 RCX: 0000000000000000
Apr 30 19:54:21 iomap kernel: RDX: 0000000400000000 RSI: 0000000400000000 RDI: ffff92f9c23c5788
Apr 30 19:54:21 iomap kernel: RBP: 0000000400000000 R08: ffffa437c0723aa0 R09: ffffa437c0723af0
Apr 30 19:54:21 iomap kernel: R10: 0000000000000001 R11: 000000000000c000 R12: ffff92f9c23c5788
Apr 30 19:54:21 iomap kernel: R13: ffffa437c0723af0 R14: ffffffff828962e0 R15: ffffa437c0723cc8
Apr 30 19:54:21 iomap kernel: FS:  00007febee04b8c0(0000) GS:ffff92fa3bd00000(0000) knlGS:0000000000000000
Apr 30 19:54:21 iomap kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Apr 30 19:54:21 iomap kernel: CR2: 00007ffd68325108 CR3: 0000000102a9a005 CR4: 0000000000770ee0
Apr 30 19:54:21 iomap kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Apr 30 19:54:21 iomap kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Apr 30 19:54:21 iomap kernel: PKRU: 55555554
Apr 30 19:54:21 iomap kernel: Call Trace:
Apr 30 19:54:21 iomap kernel:  <TASK>
Apr 30 19:54:21 iomap kernel:  iomap_iter+0x179/0x350
Apr 30 19:54:21 iomap kernel:  iomap_readahead+0x200/0x2e0
Apr 30 19:54:21 iomap kernel:  read_pages+0x69/0x1f0
Apr 30 19:54:21 iomap kernel:  ? folio_add_lru+0x7e/0xe0
Apr 30 19:54:21 iomap kernel:  page_cache_ra_unbounded+0x187/0x1d0
Apr 30 19:54:21 iomap kernel:  force_page_cache_ra+0x94/0xb0
Apr 30 19:54:21 iomap kernel:  filemap_get_pages+0x10e/0x650
Apr 30 19:54:21 iomap kernel:  ? _raw_spin_lock+0x13/0x40
Apr 30 19:54:21 iomap kernel:  ? _raw_spin_unlock+0x15/0x30
Apr 30 19:54:21 iomap kernel:  ? __mark_inode_dirty+0x155/0x380
Apr 30 19:54:21 iomap kernel:  filemap_read+0xbf/0x340
Apr 30 19:54:21 iomap kernel:  ? aa_file_perm+0x117/0x4b0
Apr 30 19:54:21 iomap kernel:  ? generic_fillattr+0x45/0xf0
Apr 30 19:54:21 iomap kernel:  ? _copy_to_user+0x22/0x30
Apr 30 19:54:21 iomap kernel:  ? cp_new_stat+0x150/0x180
Apr 30 19:54:21 iomap kernel:  blkdev_read_iter+0x5e/0x140
Apr 30 19:54:21 iomap kernel:  vfs_read+0x1f0/0x2c0
Apr 30 19:54:21 iomap kernel:  ksys_read+0x63/0xe0
Apr 30 19:54:21 iomap kernel:  do_syscall_64+0x37/0x90
Apr 30 19:54:21 iomap kernel:  entry_SYSCALL_64_after_hwframe+0x72/0xdc
Apr 30 19:54:21 iomap kernel: RIP: 0033:0x7febee74e03d
Apr 30 19:54:21 iomap kernel: Code: 31 c0 e9 c6 fe ff ff 50 48 8d 3d a6
55 0a 00 e8 39 fe 01 00 66 0f 1f 84 00 00 00 00 00 80 3d a1 25 0e 00 00
74 17 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 5b c>
Apr 30 19:54:21 iomap kernel: RSP: 002b:00007ffd68329148 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
Apr 30 19:54:21 iomap kernel: RAX: ffffffffffffffda RBX: 000055b30a902020 RCX: 00007febee74e03d
Apr 30 19:54:21 iomap kernel: RDX: 0000000000000100 RSI: 000055b30a905138 RDI: 000000000000000c
Apr 30 19:54:21 iomap kernel: RBP: 00000003ffffe000 R08: 00007febee828d40 R09: 00007febee828d40
Apr 30 19:54:21 iomap kernel: R10: 0000000000000000 R11: 0000000000000246 R12: 000055b30a905110
Apr 30 19:54:21 iomap kernel: R13: 0000000000000100 R14: 000055b30a902078 R15: 000055b30a905128
Apr 30 19:54:21 iomap kernel:  </TASK>
Apr 30 19:54:21 iomap kernel: ---[ end trace 0000000000000000 ]---

And then also:

Apr 30 19:54:21 iomap kernel: ------------[ cut here ]------------
Apr 30 19:54:21 iomap kernel: WARNING: CPU: 4 PID: 197 at fs/iomap/iter.c:32 iomap_iter+0x335/0x350
Apr 30 19:54:21 iomap kernel: Modules linked in: psmouse virtio_blk failover nvme nvme_core crc32c_intel t10_pi virtio_pci
virtio_pci_legacy_dev virtio_pci_modern_dev virtio crc64_rocksoft >
Apr 30 19:54:21 iomap kernel: CPU: 4 PID: 197 Comm: (udev-worker) Tainted: G        W          6.3.0-large-block-20230426-dirty #7
Apr 30 19:54:21 iomap kernel: Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
Apr 30 19:54:21 iomap kernel: RIP: 0010:iomap_iter+0x335/0x350
Apr 30 19:54:21 iomap kernel: Code: b8 fe ff ff e8 dc 29 c0 ff e9 ae fe
ff ff 0f 0b e9 71 fe ff ff 0f 0b e9 77 fe ff ff 0f 0b e9 7c fe ff ff 0f
0b e9 7f fe ff ff <0f> 0b b8 fb ff ff ff e9 8>
Apr 30 19:54:21 iomap kernel: RSP: 0018:ffffa437c0723a28 EFLAGS: 00010287
Apr 30 19:54:21 iomap kernel: RAX: 0000000000008000 RBX: ffffa437c0723a78 RCX: 0000000000008000
Apr 30 19:54:21 iomap kernel: RDX: 000000000000a000 RSI: 00000003fffab000 RDI: ffffa437c0723a78
Apr 30 19:54:21 iomap kernel: RBP: ffffffff828962e0 R08: 0000000000005000 R09: ffffffff83133b20
Apr 30 19:54:21 iomap kernel: R10: 0000000000000001 R11: 000000000000c000 R12: 0000000000008000
Apr 30 19:54:21 iomap kernel: R13: ffffa437c0723a78 R14: ffffffff828962e0 R15: ffffa437c0723cc8
Apr 30 19:54:21 iomap kernel: FS:  00007febee04b8c0(0000) GS:ffff92fa3bd00000(0000) knlGS:0000000000000000
Apr 30 19:54:21 iomap kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Apr 30 19:54:21 iomap kernel: CR2: 00007ffd68325108 CR3: 0000000102a9a005 CR4: 0000000000770ee0
Apr 30 19:54:21 iomap kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Apr 30 19:54:21 iomap kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Apr 30 19:54:21 iomap kernel: PKRU: 55555554
Apr 30 19:54:21 iomap kernel: Call Trace:
Apr 30 19:54:21 iomap kernel:  <TASK>
Apr 30 19:54:21 iomap kernel:  iomap_readahead+0x200/0x2e0
Apr 30 19:54:21 iomap kernel:  read_pages+0x69/0x1f0
Apr 30 19:54:21 iomap kernel:  ? folio_add_lru+0x7e/0xe0
Apr 30 19:54:21 iomap kernel:  page_cache_ra_unbounded+0x187/0x1d0
Apr 30 19:54:21 iomap kernel:  force_page_cache_ra+0x94/0xb0
Apr 30 19:54:21 iomap kernel:  filemap_get_pages+0x10e/0x650
Apr 30 19:54:21 iomap kernel:  filemap_read+0xbf/0x340
Apr 30 19:54:21 iomap kernel:  ? tomoyo_path_number_perm+0x68/0x1d0
Apr 30 19:54:21 iomap kernel:  ? aa_file_perm+0x117/0x4b0
Apr 30 19:54:21 iomap kernel:  blkdev_read_iter+0x5e/0x140
Apr 30 19:54:21 iomap kernel:  vfs_read+0x1f0/0x2c0
Apr 30 19:54:21 iomap kernel:  ksys_read+0x63/0xe0
Apr 30 19:54:21 iomap kernel:  do_syscall_64+0x37/0x90
Apr 30 19:54:21 iomap kernel:  entry_SYSCALL_64_after_hwframe+0x72/0xdc
Apr 30 19:54:21 iomap kernel: RIP: 0033:0x7febee74e03d
Apr 30 19:54:21 iomap kernel: Code: 31 c0 e9 c6 fe ff ff 50 48 8d 3d a6
55 0a 00 e8 39 fe 01 00 66 0f 1f 84 00 00 00 00 00 80 3d a1 25 0e 00 00
74 17 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 5b c>
Apr 30 19:54:21 iomap kernel: RSP: 002b:00007ffd68329198 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
Apr 30 19:54:21 iomap kernel: RAX: ffffffffffffffda RBX: 000055b30a902020 RCX: 00007febee74e03d
Apr 30 19:54:21 iomap kernel: RDX: 0000000000000018 RSI: 000055b30a8fe518 RDI: 000000000000000c
Apr 30 19:54:21 iomap kernel: RBP: 00000003fffaba00 R08: 00007febee828cc0 R09: 0000000000000070
Apr 30 19:54:21 iomap kernel: R10: 0000000000000000 R11: 0000000000000246 R12: 000055b30a8fe4f0
Apr 30 19:54:21 iomap kernel: R13: 0000000000000018 R14: 000055b30a902078 R15: 000055b30a8fe508
Apr 30 19:54:21 iomap kernel:  </TASK>
Apr 30 19:54:21 iomap kernel: ---[ end trace 0000000000000000 ]---
Matthew Wilcox May 1, 2023, 3:46 p.m. UTC | #4
On Sun, Apr 30, 2023 at 08:14:03PM -0700, Luis Chamberlain wrote:
> On Sat, Apr 29, 2023 at 02:20:17AM +0100, Matthew Wilcox wrote:
> > > [   11.322212] Call Trace:
> > > [   11.323224]  <TASK>
> > > [   11.324146]  iomap_readpage_iter+0x96/0x300
> > > [   11.325694]  iomap_readahead+0x174/0x2d0
> > > [   11.327129]  read_pages+0x69/0x1f0
> > > [   11.329751]  page_cache_ra_unbounded+0x187/0x1d0
> > 
> > ... that shouldn't be possible.  read_pages() allocates pages, puts them
> > in the page cache and tells the filesystem to fill them in.
> > 
> > In your patches, did you call mapping_set_large_folios() anywhere?
> 
> No but the only place to add that would be in the block cache. Adding
> that alone to the block cache doesn't fix the issue. The below patch
> however does get us by.

That's "working around the error", not fixing it ... probably the same
root cause as your other errors; at least I'm not diving into them until
the obvious one is fixed.

> >From my readings it does't seem like readahead_folio() should always
> return non-NULL, and also I couldn't easily verify the math is right.

readahead_folio() always returns non-NULL.  That's guaranteed by how
page_cache_ra_unbounded() and page_cache_ra_order() work.  It allocates
folios, until it can't (already-present folio, ENOMEM, EOF, max batch
size) and then calls the filesystem to make those folios uptodate,
telling it how many folios it put in the page cache, where they start.

Hm.  The fact that it's coming from page_cache_ra_unbounded() makes
me wonder if you updated this line:

                folio = filemap_alloc_folio(gfp_mask, 0);

without updating this line:

                ractl->_nr_pages++;

This is actually number of pages, not number of folios, so needs to be
		ractl->_nr_pages += 1 << order;

various other parts of page_cache_ra_unbounded() need to be examined
carefully for assumptions of order-0; it's never been used for that
before.  all the large folio work has concentrated on
page_cache_ra_order()
Pankaj Raghav May 1, 2023, 4 p.m. UTC | #5
>> No but the only place to add that would be in the block cache. Adding
>> that alone to the block cache doesn't fix the issue. The below patch
>> however does get us by.
> 
> That's "working around the error", not fixing it ... probably the same
> root cause as your other errors; at least I'm not diving into them until
> the obvious one is fixed.
> 
>> >From my readings it does't seem like readahead_folio() should always
>> return non-NULL, and also I couldn't easily verify the math is right.
> 
> readahead_folio() always returns non-NULL.  That's guaranteed by how
> page_cache_ra_unbounded() and page_cache_ra_order() work.  It allocates
> folios, until it can't (already-present folio, ENOMEM, EOF, max batch
> size) and then calls the filesystem to make those folios uptodate,
> telling it how many folios it put in the page cache, where they start.
> 
> Hm.  The fact that it's coming from page_cache_ra_unbounded() makes
> me wonder if you updated this line:
> 
>                 folio = filemap_alloc_folio(gfp_mask, 0);
> 
> without updating this line:
> 
>                 ractl->_nr_pages++;
> 
> This is actually number of pages, not number of folios, so needs to be
> 		ractl->_nr_pages += 1 << order;
> 

I already had a patch which did the following:

ractl->_nr_pages += folio_nr_pages(folio);

but the variable `i` in the loop was not updated properly (assumption of zero order folio). This now
fixes the crash:

@@ -210,7 +210,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
        unsigned long index = readahead_index(ractl);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
        unsigned long i;
-
+       int order = 0;
        /*
         * Partway through the readahead operation, we will have added
         * locked pages to the page cache, but will not yet have submitted
@@ -223,6 +223,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
         */
        unsigned int nofs = memalloc_nofs_save();

+       if (mapping->host->i_blkbits > PAGE_SHIFT)
+               order = mapping->host->i_blkbits - PAGE_SHIFT;
+
        filemap_invalidate_lock_shared(mapping);
        /*
         * Preallocate as many pages as we will need.
@@ -245,7 +248,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
                        continue;
                }

-               folio = filemap_alloc_folio(gfp_mask, 0);
+               folio = filemap_alloc_folio(gfp_mask, order);
                if (!folio)
                        break;
                if (filemap_add_folio(mapping, folio, index + i,
@@ -259,7 +262,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
                if (i == nr_to_read - lookahead_size)
                        folio_set_readahead(folio);
                ractl->_workingset |= folio_test_workingset(folio);
-               ractl->_nr_pages++;
+               ractl->_nr_pages += folio_nr_pages(folio);
+               i += folio_nr_pages(folio) - 1;
        }

> various other parts of page_cache_ra_unbounded() need to be examined
> carefully for assumptions of order-0; it's never been used for that
> before.  all the large folio work has concentrated on
> page_cache_ra_order()

As you have noted here, this needs to be examined more carefully. Even though the patches fix the
crash, fio with the verify option fails (i.e., write and read are not giving the same output).

I think it is better to send an RFC patch series on top of Christoph's work with optional
BUFFER_HEAD to iron out some core issues/bugs.
diff mbox series

Patch

diff --git a/block/fops.c b/block/fops.c
index 7910636f8df33b..524b8a828aad3d 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -24,15 +24,6 @@  static inline struct inode *bdev_file_inode(struct file *file)
 	return file->f_mapping->host;
 }
 
-static int blkdev_get_block(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh, int create)
-{
-	bh->b_bdev = I_BDEV(inode);
-	bh->b_blocknr = iblock;
-	set_buffer_mapped(bh);
-	return 0;
-}
-
 static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
 {
 	blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
@@ -400,7 +391,8 @@  static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 	iomap->type = IOMAP_MAPPED;
 	iomap->addr = iomap->offset;
 	iomap->length = isize - iomap->offset;
-	iomap->flags |= IOMAP_F_BUFFER_HEAD;
+	if (IS_ENABLED(CONFIG_BUFFER_HEAD))
+		iomap->flags |= IOMAP_F_BUFFER_HEAD;
 	return 0;
 }
 
@@ -408,6 +400,16 @@  static const struct iomap_ops blkdev_iomap_ops = {
 	.iomap_begin		= blkdev_iomap_begin,
 };
 
+#ifdef CONFIG_BUFFER_HEAD
+static int blkdev_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh, int create)
+{
+	bh->b_bdev = I_BDEV(inode);
+	bh->b_blocknr = iblock;
+	set_buffer_mapped(bh);
+	return 0;
+}
+
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, blkdev_get_block, wbc);
@@ -453,6 +455,55 @@  const struct address_space_operations def_blk_aops = {
 	.migrate_folio	= buffer_migrate_folio_norefs,
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
+#else /* CONFIG_BUFFER_HEAD */
+static int blkdev_read_folio(struct file *file, struct folio *folio)
+{
+	return iomap_read_folio(folio, &blkdev_iomap_ops);
+}
+
+static void blkdev_readahead(struct readahead_control *rac)
+{
+	iomap_readahead(rac, &blkdev_iomap_ops);
+}
+
+static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
+		struct inode *inode, loff_t offset)
+{
+	loff_t isize = i_size_read(inode);
+
+	if (WARN_ON_ONCE(offset >= isize))
+		return -EIO;
+	if (offset >= wpc->iomap.offset &&
+	    offset < wpc->iomap.offset + wpc->iomap.length)
+		return 0;
+	return blkdev_iomap_begin(inode, offset, isize - offset,
+				  IOMAP_WRITE, &wpc->iomap, NULL);
+}
+
+static const struct iomap_writeback_ops blkdev_writeback_ops = {
+	.map_blocks		= blkdev_map_blocks,
+};
+
+static int blkdev_writepages(struct address_space *mapping,
+		struct writeback_control *wbc)
+{
+	struct iomap_writepage_ctx wpc = { };
+
+	return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+}
+
+const struct address_space_operations def_blk_aops = {
+	.dirty_folio	= filemap_dirty_folio,
+	.release_folio		= iomap_release_folio,
+	.invalidate_folio	= iomap_invalidate_folio,
+	.read_folio		= blkdev_read_folio,
+	.readahead		= blkdev_readahead,
+	.writepages		= blkdev_writepages,
+	.is_partially_uptodate  = iomap_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+	.migrate_folio		= filemap_migrate_folio,
+};
+#endif /* CONFIG_BUFFER_HEAD */
 
 /*
  * for a block special file file_inode(file)->i_size is zero
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b0a22e99bade37..9ee18013b1f2ab 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -15,6 +15,7 @@  if MD
 config BLK_DEV_MD
 	tristate "RAID support"
 	select BLOCK_HOLDER_DEPRECATED if SYSFS
+	select BUFFER_HEAD
 	# BLOCK_LEGACY_AUTOLOAD requirement should be removed
 	# after relevant mdadm enhancements - to make "names=yes"
 	# the default - are widely available.
diff --git a/fs/Kconfig b/fs/Kconfig
index e99830c650336a..366d5d5be2784b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -18,8 +18,12 @@  config VALIDATE_FS_PARSER
 config FS_IOMAP
 	bool
 
+config BUFFER_HEAD
+	bool
+
 # old blockdev_direct_IO implementation.  Use iomap for new code instead
 config LEGACY_DIRECT_IO
+	depends on BUFFER_HEAD
 	bool
 
 if BLOCK
diff --git a/fs/Makefile b/fs/Makefile
index da21e7d0a1cf37..3cd6aa1d2ce387 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -18,7 +18,7 @@  obj-y :=	open.o read_write.o file_table.o super.o \
 		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
 		kernel_read_file.o mnt_idmapping.o remap_range.o
 
-obj-$(CONFIG_BLOCK)		+= buffer.o mpage.o
+obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
 obj-$(CONFIG_LEGACY_DIRECT_IO)	+= direct-io.o
 obj-y				+= notify/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 44738fed66251f..1b97058f0c4a92 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -2,6 +2,7 @@ 
 config ADFS_FS
 	tristate "ADFS file system support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index 962b86374e1c15..1ae432d266c32f 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -2,6 +2,7 @@ 
 config AFFS_FS
 	tristate "Amiga FFS file system support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	select LEGACY_DIRECT_IO
 	help
 	  The Fast File System (FFS) is the common file system used on hard
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index 9550b6462b8147..5fcfc4024ffe6f 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -2,6 +2,7 @@ 
 config BEFS_FS
 	tristate "BeOS file system (BeFS) support (read only)"
 	depends on BLOCK
+	select BUFFER_HEAD
 	select NLS
 	help
 	  The BeOS File System (BeFS) is the native file system of Be, Inc's
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index 3a757805b58568..8e7ef866b62a62 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -2,6 +2,7 @@ 
 config BFS_FS
 	tristate "BFS file system support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	help
 	  Boot File System (BFS) is a file system used under SCO UnixWare to
 	  allow the bootloader access to the kernel image and other important
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
index 2df1bac8b375b1..0833e533df9d53 100644
--- a/fs/efs/Kconfig
+++ b/fs/efs/Kconfig
@@ -2,6 +2,7 @@ 
 config EFS_FS
 	tristate "EFS file system support (read only)"
 	depends on BLOCK
+	select BUFFER_HEAD
 	help
 	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
 	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig
index 147edeb044691d..cbeca8e44d9b38 100644
--- a/fs/exfat/Kconfig
+++ b/fs/exfat/Kconfig
@@ -2,6 +2,7 @@ 
 
 config EXFAT_FS
 	tristate "exFAT filesystem support"
+	select BUFFER_HEAD
 	select NLS
 	select LEGACY_DIRECT_IO
 	help
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 77393fda99af09..74d98965902e16 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config EXT2_FS
 	tristate "Second extended fs support"
+	select BUFFER_HEAD
 	select FS_IOMAP
 	select LEGACY_DIRECT_IO
 	help
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 86699c8cab281c..e20d59221fc05b 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -28,6 +28,7 @@  config EXT3_FS_SECURITY
 
 config EXT4_FS
 	tristate "The Extended 4 (ext4) filesystem"
+	select BUFFER_HEAD
 	select JBD2
 	select CRC16
 	select CRYPTO
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 03ef087537c7c4..68a1e23e1557c7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -2,6 +2,7 @@ 
 config F2FS_FS
 	tristate "F2FS filesystem support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	select NLS
 	select CRYPTO
 	select CRYPTO_CRC32
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index afe83b4e717280..25fae1c83725bc 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config FAT_FS
 	tristate
+	select BUFFER_HEAD
 	select NLS
 	select LEGACY_DIRECT_IO
 	help
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
index 0e2fc08f7de492..912107ebea6f40 100644
--- a/fs/freevxfs/Kconfig
+++ b/fs/freevxfs/Kconfig
@@ -2,6 +2,7 @@ 
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
 	depends on BLOCK
+	select BUFFER_HEAD
 	help
 	  FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
 	  file system format.  VERITAS VxFS(TM) is the standard file system
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 03c966840422ec..be7f87a8e11ae1 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config GFS2_FS
 	tristate "GFS2 file system support"
+	select BUFFER_HEAD
 	select FS_POSIX_ACL
 	select CRC32
 	select LIBCRC32C
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
index d985066006d588..5ea5cd8ecea9c0 100644
--- a/fs/hfs/Kconfig
+++ b/fs/hfs/Kconfig
@@ -2,6 +2,7 @@ 
 config HFS_FS
 	tristate "Apple Macintosh file system support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	select NLS
 	select LEGACY_DIRECT_IO
 	help
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
index 8034e7827a690b..8ce4a33a9ac788 100644
--- a/fs/hfsplus/Kconfig
+++ b/fs/hfsplus/Kconfig
@@ -2,6 +2,7 @@ 
 config HFSPLUS_FS
 	tristate "Apple Extended HFS file system support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	select NLS
 	select NLS_UTF8
 	select LEGACY_DIRECT_IO
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index ec975f4668775f..ac1e9318e65a4a 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -2,6 +2,7 @@ 
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	select FS_IOMAP
 	help
 	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3d5042efda202a..336a9d4542c97a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -42,6 +42,12 @@  static inline struct iomap_page *to_iomap_page(struct folio *folio)
 	return NULL;
 }
 
+/* Use buffer_heads only if they are built in and IOMAP_F_BUFFER_HEAD is set. */
+static inline bool iomap_use_buffer_heads(const struct iomap *iomap)
+{
+	return IS_ENABLED(CONFIG_BUFFER_HEAD) && (iomap->flags & IOMAP_F_BUFFER_HEAD);
+}
+
 static struct bio_set iomap_ioend_bioset;
 
 static struct iomap_page *
@@ -683,7 +689,7 @@  static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
 
 	if (srcmap->type == IOMAP_INLINE)
 		status = iomap_write_begin_inline(iter, folio);
-	else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
+	else if (iomap_use_buffer_heads(srcmap))
 		status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
 	else
 		status = __iomap_write_begin(iter, pos, len, folio);
@@ -753,7 +759,7 @@  static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
 
 	if (srcmap->type == IOMAP_INLINE) {
 		ret = iomap_write_end_inline(iter, folio, pos, copied);
-	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
+	} else if (iomap_use_buffer_heads(srcmap)) {
 		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
 				copied, &folio->page, NULL);
 	} else {
@@ -1256,7 +1262,7 @@  static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
 	loff_t length = iomap_length(iter);
 	int ret;
 
-	if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
+	if (iomap_use_buffer_heads(&iter->iomap)) {
 		ret = __block_write_begin_int(folio, iter->pos, length, NULL,
 					      &iter->iomap);
 		if (ret)
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
index 08ffd37b9bb8f6..51434f2a471b0f 100644
--- a/fs/isofs/Kconfig
+++ b/fs/isofs/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config ISO9660_FS
 	tristate "ISO 9660 CDROM file system support"
+	select BUFFER_HEAD
 	help
 	  This is the standard file system used on CD-ROMs.  It was previously
 	  known as "High Sierra File System" and is called "hsfs" on other
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
index 51e856f0e4b8d6..17488440eef1a9 100644
--- a/fs/jfs/Kconfig
+++ b/fs/jfs/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config JFS_FS
 	tristate "JFS filesystem support"
+	select BUFFER_HEAD
 	select NLS
 	select CRC32
 	select LEGACY_DIRECT_IO
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index de2003974ff0d0..90ddfad2a75e8f 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -2,6 +2,7 @@ 
 config MINIX_FS
 	tristate "Minix file system support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	help
 	  Minix is a simple operating system used in many classes about OS's.
 	  The minix file system (method to organize files on a hard disk
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 7d59567465e121..7dae168e346e30 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config NILFS2_FS
 	tristate "NILFS2 file system support"
+	select BUFFER_HEAD
 	select CRC32
 	select LEGACY_DIRECT_IO
 	help
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
index f93e69a612833f..7b2509741735a9 100644
--- a/fs/ntfs/Kconfig
+++ b/fs/ntfs/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config NTFS_FS
 	tristate "NTFS file system support"
+	select BUFFER_HEAD
 	select NLS
 	help
 	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig
index 96cc236f7f7bd3..cdfdf51e55d797 100644
--- a/fs/ntfs3/Kconfig
+++ b/fs/ntfs3/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config NTFS3_FS
 	tristate "NTFS Read-Write file system support"
+	select BUFFER_HEAD
 	select NLS
 	select LEGACY_DIRECT_IO
 	help
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 304d12186ccd38..4a8288f7ee8697 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -2,6 +2,7 @@ 
 config OCFS2_FS
 	tristate "OCFS2 file system support"
 	depends on INET && SYSFS && CONFIGFS_FS
+	select BUFFER_HEAD
 	select JBD2
 	select CRC32
 	select QUOTA
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
index 42b2ec35a05bfb..8470f6c3e64e6a 100644
--- a/fs/omfs/Kconfig
+++ b/fs/omfs/Kconfig
@@ -2,6 +2,7 @@ 
 config OMFS_FS
 	tristate "SonicBlue Optimized MPEG File System support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	select CRC_ITU_T
 	help
 	  This is the proprietary file system used by the Rio Karma music
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
index 45b5b98376c436..a2eb826e76c602 100644
--- a/fs/qnx4/Kconfig
+++ b/fs/qnx4/Kconfig
@@ -2,6 +2,7 @@ 
 config QNX4FS_FS
 	tristate "QNX4 file system support (read only)"
 	depends on BLOCK
+	select BUFFER_HEAD
 	help
 	  This is the file system used by the real-time operating systems
 	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig
index 6a9d6bce158622..8e865d72204e75 100644
--- a/fs/qnx6/Kconfig
+++ b/fs/qnx6/Kconfig
@@ -2,6 +2,7 @@ 
 config QNX6FS_FS
 	tristate "QNX6 file system support (read only)"
 	depends on BLOCK && CRC32
+	select BUFFER_HEAD
 	help
 	  This is the file system used by the real-time operating systems
 	  QNX 6 (also called QNX RTP).
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 4d22ecfe0fab65..0e6fe26458fede 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config REISERFS_FS
 	tristate "Reiserfs support (deprecated)"
+	select BUFFER_HEAD
 	select CRC32
 	select LEGACY_DIRECT_IO
 	help
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
index b4e23e03fbeba3..67b3f90afbfd67 100644
--- a/fs/sysv/Kconfig
+++ b/fs/sysv/Kconfig
@@ -2,6 +2,7 @@ 
 config SYSV_FS
 	tristate "System V/Xenix/V7/Coherent file system support"
 	depends on BLOCK
+	select BUFFER_HEAD
 	help
 	  SCO, Xenix and Coherent are commercial Unix systems for Intel
 	  machines, and Version 7 was used on the DEC PDP-11. Saying Y
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 82e8bfa2dfd989..8f7ce30d47fdce 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 config UDF_FS
 	tristate "UDF file system support"
+	select BUFFER_HEAD
 	select CRC_ITU_T
 	select NLS
 	select LEGACY_DIRECT_IO
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 6d30adb6b890fc..9301e7ecd09210 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -2,6 +2,7 @@ 
 config UFS_FS
 	tristate "UFS file system support (read only)"
 	depends on BLOCK
+	select BUFFER_HEAD
 	help
 	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
 	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 0fcc16b7f02bb4..1ef3915cacfae2 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -16,8 +16,6 @@ 
 #include <linux/wait.h>
 #include <linux/atomic.h>
 
-#ifdef CONFIG_BLOCK
-
 enum bh_state_bits {
 	BH_Uptodate,	/* Contains valid data */
 	BH_Dirty,	/* Is dirty */
@@ -196,7 +194,6 @@  void mark_buffer_write_io_error(struct buffer_head *bh);
 void touch_buffer(struct buffer_head *bh);
 void set_bh_page(struct buffer_head *bh,
 		struct page *page, unsigned long offset);
-bool try_to_free_buffers(struct folio *);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
 void create_empty_buffers(struct page *, unsigned long,
@@ -207,10 +204,6 @@  void end_buffer_async_write(struct buffer_head *bh, int uptodate);
 
 /* Things to do with buffers at mapping->private_list */
 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
-int inode_has_buffers(struct inode *);
-void invalidate_inode_buffers(struct inode *);
-int remove_inode_buffers(struct inode *inode);
-int sync_mapping_buffers(struct address_space *mapping);
 void clean_bdev_aliases(struct block_device *bdev, sector_t block,
 			sector_t len);
 static inline void clean_bdev_bh_alias(struct buffer_head *bh)
@@ -230,9 +223,6 @@  void __bforget(struct buffer_head *);
 void __breadahead(struct block_device *, sector_t block, unsigned int size);
 struct buffer_head *__bread_gfp(struct block_device *,
 				sector_t block, unsigned size, gfp_t gfp);
-void invalidate_bh_lrus(void);
-void invalidate_bh_lrus_cpu(void);
-bool has_bh_in_lru(int cpu, void *dummy);
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void unlock_buffer(struct buffer_head *bh);
@@ -248,8 +238,6 @@  int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
 void __bh_read_batch(int nr, struct buffer_head *bhs[],
 		     blk_opf_t op_flags, bool force_lock);
 
-extern int buffer_heads_over_limit;
-
 /*
  * Generic address_space_operations implementations for buffer_head-backed
  * address_spaces.
@@ -294,8 +282,6 @@  extern int buffer_migrate_folio_norefs(struct address_space *,
 #define buffer_migrate_folio_norefs NULL
 #endif
 
-void buffer_init(void);
-
 /*
  * inline definitions
  */
@@ -455,7 +441,20 @@  __bread(struct block_device *bdev, sector_t block, unsigned size)
 
 bool block_dirty_folio(struct address_space *mapping, struct folio *folio);
 
-#else /* CONFIG_BLOCK */
+#ifdef CONFIG_BUFFER_HEAD
+
+void buffer_init(void);
+bool try_to_free_buffers(struct folio *folio);
+int inode_has_buffers(struct inode *inode);
+void invalidate_inode_buffers(struct inode *inode);
+int remove_inode_buffers(struct inode *inode);
+int sync_mapping_buffers(struct address_space *mapping);
+void invalidate_bh_lrus(void);
+void invalidate_bh_lrus_cpu(void);
+bool has_bh_in_lru(int cpu, void *dummy);
+extern int buffer_heads_over_limit;
+
+#else /* CONFIG_BUFFER_HEAD */
 
 static inline void buffer_init(void) {}
 static inline bool try_to_free_buffers(struct folio *folio) { return true; }
@@ -463,9 +462,10 @@  static inline int inode_has_buffers(struct inode *inode) { return 0; }
 static inline void invalidate_inode_buffers(struct inode *inode) {}
 static inline int remove_inode_buffers(struct inode *inode) { return 1; }
 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void invalidate_bh_lrus(void) {}
 static inline void invalidate_bh_lrus_cpu(void) {}
 static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
 #define buffer_heads_over_limit 0
 
-#endif /* CONFIG_BLOCK */
+#endif /* CONFIG_BUFFER_HEAD */
 #endif /* _LINUX_BUFFER_HEAD_H */
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 7f4dfbdf12a6f1..a0503f387104a1 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -12,6 +12,7 @@ 
 
 #define RWBS_LEN	8
 
+#ifdef CONFIG_BUFFER_HEAD
 DECLARE_EVENT_CLASS(block_buffer,
 
 	TP_PROTO(struct buffer_head *bh),
@@ -61,6 +62,7 @@  DEFINE_EVENT(block_buffer, block_dirty_buffer,
 
 	TP_ARGS(bh)
 );
+#endif /* CONFIG_BUFFER_HEAD */
 
 /**
  * block_rq_requeue - place block IO request back on a queue
diff --git a/mm/migrate.c b/mm/migrate.c
index db3f154446af4e..9f9a81d43fc8f2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -692,7 +692,7 @@  int migrate_folio(struct address_space *mapping, struct folio *dst,
 }
 EXPORT_SYMBOL(migrate_folio);
 
-#ifdef CONFIG_BLOCK
+#ifdef CONFIG_BUFFER_HEAD
 /* Returns true if all buffers are successfully locked */
 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 							enum migrate_mode mode)
@@ -850,7 +850,7 @@  int buffer_migrate_folio_norefs(struct address_space *mapping,
 	return __buffer_migrate_folio(mapping, dst, src, mode, true);
 }
 EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
-#endif
+#endif /* CONFIG_BUFFER_HEAD */
 
 int filemap_migrate_folio(struct address_space *mapping,
 		struct folio *dst, struct folio *src, enum migrate_mode mode)