diff mbox series

[v2,5/9] iomap: Support arbitrarily many blocks per page

Message ID 20200910234707.5504-6-willy@infradead.org (mailing list archive)
State New, archived
Headers show
Series THP iomap patches for 5.10 | expand

Commit Message

Matthew Wilcox Sept. 10, 2020, 11:47 p.m. UTC
Size the uptodate array dynamically to support larger pages in the
page cache.  With a 64kB page, we're only saving 8 bytes per page today,
but with a 2MB maximum page size, we'd have to allocate more than 4kB
per page.  Add a few debugging assertions.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/iomap/buffered-io.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

Comments

Christoph Hellwig Sept. 11, 2020, 5:36 a.m. UTC | #1
On Fri, Sep 11, 2020 at 12:47:03AM +0100, Matthew Wilcox (Oracle) wrote:
> Size the uptodate array dynamically to support larger pages in the
> page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> but with a 2MB maximum page size, we'd have to allocate more than 4kB
> per page.  Add a few debugging assertions.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>
Darrick J. Wong Sept. 17, 2020, 10 p.m. UTC | #2
On Fri, Sep 11, 2020 at 12:47:03AM +0100, Matthew Wilcox (Oracle) wrote:
> Size the uptodate array dynamically to support larger pages in the
> page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> but with a 2MB maximum page size, we'd have to allocate more than 4kB
> per page.  Add a few debugging assertions.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>

Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> ---
>  fs/iomap/buffered-io.c | 22 +++++++++++++++++-----
>  1 file changed, 17 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 7fc0e02d27b0..9670c096b83e 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -22,18 +22,25 @@
>  #include "../internal.h"
>  
>  /*
> - * Structure allocated for each page when block size < PAGE_SIZE to track
> - * sub-page uptodate status and I/O completions.
> + * Structure allocated for each page or THP when block size < page size
> + * to track sub-page uptodate status and I/O completions.
>   */
>  struct iomap_page {
>  	atomic_t		read_count;
>  	atomic_t		write_count;
>  	spinlock_t		uptodate_lock;
> -	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
> +	unsigned long		uptodate[];
>  };
>  
>  static inline struct iomap_page *to_iomap_page(struct page *page)
>  {
> +	/*
> +	 * per-block data is stored in the head page.  Callers should
> +	 * not be dealing with tail pages (and if they are, they can
> +	 * call thp_head() first.
> +	 */
> +	VM_BUG_ON_PGFLAGS(PageTail(page), page);
> +
>  	if (page_has_private(page))
>  		return (struct iomap_page *)page_private(page);
>  	return NULL;
> @@ -45,11 +52,13 @@ static struct iomap_page *
>  iomap_page_create(struct inode *inode, struct page *page)
>  {
>  	struct iomap_page *iop = to_iomap_page(page);
> +	unsigned int nr_blocks = i_blocks_per_page(inode, page);
>  
> -	if (iop || i_blocks_per_page(inode, page) <= 1)
> +	if (iop || nr_blocks <= 1)
>  		return iop;
>  
> -	iop = kzalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
> +	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
> +			GFP_NOFS | __GFP_NOFAIL);
>  	spin_lock_init(&iop->uptodate_lock);
>  	attach_page_private(page, iop);
>  	return iop;
> @@ -59,11 +68,14 @@ static void
>  iomap_page_release(struct page *page)
>  {
>  	struct iomap_page *iop = detach_page_private(page);
> +	unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
>  
>  	if (!iop)
>  		return;
>  	WARN_ON_ONCE(atomic_read(&iop->read_count));
>  	WARN_ON_ONCE(atomic_read(&iop->write_count));
> +	WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
> +			PageUptodate(page));
>  	kfree(iop);
>  }
>  
> -- 
> 2.28.0
>
Qian Cai Sept. 22, 2020, 4:23 p.m. UTC | #3
On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote:
> Size the uptodate array dynamically to support larger pages in the
> page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> but with a 2MB maximum page size, we'd have to allocate more than 4kB
> per page.  Add a few debugging assertions.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>

Some syscall fuzzing will trigger this on powerpc:

.config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config

[ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-io.c:78 iomap_page_release+0x250/0x270
[ 8805.895376][T445431] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_spapr_tce vfio vfio_spapr_eeh loop kvm_hv kvm ip_tables x_tables sd_mod bnx2x tg3 ahci libahci mdio libphy firmware_class libata dm_mirror dm_region_hash dm_log dm_mod
[ 8805.895521][T445431] CPU: 61 PID: 445431 Comm: trinity-c61 Not tainted 5.9.0-rc6-next-20200922 #3
[ 8805.895551][T445431] NIP:  c0000000004734a0 LR: c00000000047335c CTR: 0000000000000000
[ 8805.895571][T445431] REGS: c000001fe5427620 TRAP: 0700   Not tainted  (5.9.0-rc6-next-20200922)
[ 8805.895609][T445431] MSR:  9000000000029033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 44002244  XER: 20040000
[ 8805.895671][T445431] CFAR: c000000000473394 IRQMASK: 0 
[ 8805.895671][T445431] GPR00: c00000000029b698 c000001fe54278b0 c000000005687e00 0000000000000000 
[ 8805.895671][T445431] GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 
[ 8805.895671][T445431] GPR08: c00c000002c4b5c7 007fff8000000015 0000000000000001 c000000005f5e028 
[ 8805.895671][T445431] GPR12: 0000000000002000 c000001ffffc0580 0000000010037be0 00000000109ec8e0 
[ 8805.895671][T445431] GPR16: 00000000109eccb0 c000001fe54279c0 c000001fe5427a40 0000000000000000 
[ 8805.895671][T445431] GPR20: c000000000a4aa80 0000000000000008 0000000000000000 000000000000000f 
[ 8805.895671][T445431] GPR24: ffffffffffffffff c000001fe54279b8 c000001fe5427940 0000000000000001 
[ 8805.895671][T445431] GPR28: 0000000000000001 c0000014dd4f9128 0000000000000000 c00c000006273a80 
[ 8805.895989][T445431] NIP [c0000000004734a0] iomap_page_release+0x250/0x270
[ 8805.896016][T445431] LR [c00000000047335c] iomap_page_release+0x10c/0x270
[ 8805.896041][T445431] Call Trace:
[ 8805.896065][T445431] [c000001fe54278b0] [c000001fe5427940] 0xc000001fe5427940 (unreliable)
[ 8805.896101][T445431] [c000001fe54278f0] [c00000000029b698] truncate_cleanup_page+0x188/0x2e0
[ 8805.896144][T445431] [c000001fe5427920] [c00000000029c61c] truncate_inode_pages_range+0x23c/0x9f0
[ 8805.896187][T445431] [c000001fe5427b40] [c00000000029ceec] truncate_pagecache+0x5c/0x90
[ 8805.896229][T445431] [c000001fe5427b80] [c000000000569858] xfs_setattr_size+0xb8/0x5d0
[ 8805.896270][T445431] [c000001fe5427c10] [c00000000056a26c] xfs_vn_setattr+0x8c/0x130
[ 8805.896321][T445431] [c000001fe5427c60] [c0000000003dbe60] notify_change+0x390/0x5b0
[ 8805.896372][T445431] [c000001fe5427cd0] [c0000000003a5294] do_truncate+0x94/0x130
[ 8805.896412][T445431] [c000001fe5427d60] [c0000000003a57a8] do_sys_ftruncate+0xe8/0x160
[ 8805.896455][T445431] [c000001fe5427dc0] [c00000000002a458] system_call_exception+0xf8/0x1d0
[ 8805.896496][T445431] [c000001fe5427e20] [c00000000000d0a8] system_call_common+0xe8/0x218
[ 8805.896544][T445431] Instruction dump:
[ 8805.896569][T445431] 3c82fb3a 388490c8 4be65be1 60000000 0fe00000 60000000 60000000 60000000 
[ 8805.896623][T445431] 0fe00000 4bfffea8 60000000 60000000 <0fe00000> 4bfffef4 60000000 60000000 
[ 8805.896678][T445431] CPU: 61 PID: 445431 Comm: trinity-c61 Not tainted 5.9.0-rc6-next-20200922 #3
[ 8805.896716][T445431] Call Trace:
[ 8805.896750][T445431] [c000001fe5427410] [c0000000006440e8] dump_stack+0xec/0x144 (unreliable)
[ 8805.896796][T445431] [c000001fe5427450] [c0000000000b0a24] __warn+0xc4/0x144
[ 8805.896837][T445431] [c000001fe54274e0] [c000000000642cf8] report_bug+0x108/0x1f0
[ 8805.896878][T445431] [c000001fe5427580] [c000000000021714] program_check_exception+0x104/0x2e0
[ 8805.896922][T445431] [c000001fe54275b0] [c000000000009664] program_check_common_virt+0x2c4/0x310
[ 8805.896965][T445431] --- interrupt: 700 at iomap_page_release+0x250/0x270
[ 8805.896965][T445431]     LR = iomap_page_release+0x10c/0x270
[ 8805.897018][T445431] [c000001fe54278b0] [c000001fe5427940] 0xc000001fe5427940 (unreliable)
[ 8805.897053][T445431] [c000001fe54278f0] [c00000000029b698] truncate_cleanup_page+0x188/0x2e0
[ 8805.897104][T445431] [c000001fe5427920] [c00000000029c61c] truncate_inode_pages_range+0x23c/0x9f0
[ 8805.897146][T445431] [c000001fe5427b40] [c00000000029ceec] truncate_pagecache+0x5c/0x90
[ 8805.897186][T445431] [c000001fe5427b80] [c000000000569858] xfs_setattr_size+0xb8/0x5d0
[ 8805.897226][T445431] [c000001fe5427c10] [c00000000056a26c] xfs_vn_setattr+0x8c/0x130
[ 8805.897266][T445431] [c000001fe5427c60] [c0000000003dbe60] notify_change+0x390/0x5b0
[ 8805.897307][T445431] [c000001fe5427cd0] [c0000000003a5294] do_truncate+0x94/0x130
[ 8805.897329][T445431] [c000001fe5427d60] [c0000000003a57a8] do_sys_ftruncate+0xe8/0x160
[ 8805.897369][T445431] [c000001fe5427dc0] [c00000000002a458] system_call_exception+0xf8/0x1d0
[ 8805.897410][T445431] [c000001fe5427e20] [c00000000000d0a8] system_call_common+0xe8/0x218
[ 8805.897448][T445431] irq event stamp: 360936
[ 8805.897475][T445431] hardirqs last  enabled at (360935): [<c0000000002bd5b8>] dec_zone_page_state+0x78/0xb0
[ 8805.897515][T445431] hardirqs last disabled at (360936): [<c00000000000965c>] program_check_common_virt+0x2bc/0x310
[ 8805.897556][T445431] softirqs last  enabled at (342438): [<c0000000008d3d38>] __do_softirq+0x378/0x6e4
[ 8805.897597][T445431] softirqs last disabled at (342433): [<c0000000000bc838>] irq_exit+0x118/0x140

> ---
>  fs/iomap/buffered-io.c | 22 +++++++++++++++++-----
>  1 file changed, 17 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 7fc0e02d27b0..9670c096b83e 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -22,18 +22,25 @@
>  #include "../internal.h"
>  
>  /*
> - * Structure allocated for each page when block size < PAGE_SIZE to track
> - * sub-page uptodate status and I/O completions.
> + * Structure allocated for each page or THP when block size < page size
> + * to track sub-page uptodate status and I/O completions.
>   */
>  struct iomap_page {
>  	atomic_t		read_count;
>  	atomic_t		write_count;
>  	spinlock_t		uptodate_lock;
> -	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
> +	unsigned long		uptodate[];
>  };
>  
>  static inline struct iomap_page *to_iomap_page(struct page *page)
>  {
> +	/*
> +	 * per-block data is stored in the head page.  Callers should
> +	 * not be dealing with tail pages (and if they are, they can
> +	 * call thp_head() first.
> +	 */
> +	VM_BUG_ON_PGFLAGS(PageTail(page), page);
> +
>  	if (page_has_private(page))
>  		return (struct iomap_page *)page_private(page);
>  	return NULL;
> @@ -45,11 +52,13 @@ static struct iomap_page *
>  iomap_page_create(struct inode *inode, struct page *page)
>  {
>  	struct iomap_page *iop = to_iomap_page(page);
> +	unsigned int nr_blocks = i_blocks_per_page(inode, page);
>  
> -	if (iop || i_blocks_per_page(inode, page) <= 1)
> +	if (iop || nr_blocks <= 1)
>  		return iop;
>  
> -	iop = kzalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
> +	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
> +			GFP_NOFS | __GFP_NOFAIL);
>  	spin_lock_init(&iop->uptodate_lock);
>  	attach_page_private(page, iop);
>  	return iop;
> @@ -59,11 +68,14 @@ static void
>  iomap_page_release(struct page *page)
>  {
>  	struct iomap_page *iop = detach_page_private(page);
> +	unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
>  
>  	if (!iop)
>  		return;
>  	WARN_ON_ONCE(atomic_read(&iop->read_count));
>  	WARN_ON_ONCE(atomic_read(&iop->write_count));
> +	WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
> +			PageUptodate(page));
>  	kfree(iop);
>  }
>
Matthew Wilcox Sept. 22, 2020, 5:05 p.m. UTC | #4
On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote:
> On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote:
> > Size the uptodate array dynamically to support larger pages in the
> > page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> > but with a 2MB maximum page size, we'd have to allocate more than 4kB
> > per page.  Add a few debugging assertions.
> > 
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> 
> Some syscall fuzzing will trigger this on powerpc:
> 
> .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config
> 
> [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-io.c:78 iomap_page_release+0x250/0x270

Well, I'm glad it triggered.  That warning is:
        WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
                        PageUptodate(page));
so there was definitely a problem of some kind.

truncate_cleanup_page() calls
do_invalidatepage() calls
iomap_invalidatepage() calls
iomap_page_release()

Is this the first warning?  I'm wondering if maybe there was an I/O error
earlier which caused PageUptodate to get cleared again.  If it's easy to
reproduce, perhaps you could try something like this?

+void dump_iomap_page(struct page *page, const char *reason)
+{
+       struct iomap_page *iop = to_iomap_page(page);
+       unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
+
+       dump_page(page, reason);
+       if (iop)
+               printk("iop:reads %d writes %d uptodate %*pb\n",
+                               atomic_read(&iop->read_bytes_pending),
+                               atomic_read(&iop->write_bytes_pending),
+                               nr_blocks, iop->uptodate);
+       else
+               printk("iop:none\n");
+}

and then do something like:

	if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page))
		dump_iomap_page(page, NULL);
Qian Cai Sept. 22, 2020, 5:25 p.m. UTC | #5
On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote:
> On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote:
> > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote:
> > > Size the uptodate array dynamically to support larger pages in the
> > > page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> > > but with a 2MB maximum page size, we'd have to allocate more than 4kB
> > > per page.  Add a few debugging assertions.
> > > 
> > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > 
> > Some syscall fuzzing will trigger this on powerpc:
> > 
> > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config
> > 
> > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-
> > io.c:78 iomap_page_release+0x250/0x270
> 
> Well, I'm glad it triggered.  That warning is:
>         WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
>                         PageUptodate(page));
> so there was definitely a problem of some kind.
> 
> truncate_cleanup_page() calls
> do_invalidatepage() calls
> iomap_invalidatepage() calls
> iomap_page_release()
> 
> Is this the first warning?  I'm wondering if maybe there was an I/O error
> earlier which caused PageUptodate to get cleared again.  If it's easy to
> reproduce, perhaps you could try something like this?

Yes, this is the first warning. BTW, I did run the reproducer of a805c111650c
("iomap: fix WARN_ON_ONCE() from unprivileged users") earlier, so I am wondering
if this is just another victim WARN_ON_ONCE() from it.

> 
> +void dump_iomap_page(struct page *page, const char *reason)
> +{
> +       struct iomap_page *iop = to_iomap_page(page);
> +       unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
> +
> +       dump_page(page, reason);
> +       if (iop)
> +               printk("iop:reads %d writes %d uptodate %*pb\n",
> +                               atomic_read(&iop->read_bytes_pending),
> +                               atomic_read(&iop->write_bytes_pending),
> +                               nr_blocks, iop->uptodate);
> +       else
> +               printk("iop:none\n");
> +}
> 
> and then do something like:
> 
> 	if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page))
> 		dump_iomap_page(page, NULL);
>
Qian Cai Sept. 23, 2020, 1:06 a.m. UTC | #6
On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote:
> On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote:
> > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote:
> > > Size the uptodate array dynamically to support larger pages in the
> > > page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> > > but with a 2MB maximum page size, we'd have to allocate more than 4kB
> > > per page.  Add a few debugging assertions.
> > > 
> > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > 
> > Some syscall fuzzing will trigger this on powerpc:
> > 
> > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config
> > 
> > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-
> > io.c:78 iomap_page_release+0x250/0x270
> 
> Well, I'm glad it triggered.  That warning is:
>         WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
>                         PageUptodate(page));
> so there was definitely a problem of some kind.
> 
> truncate_cleanup_page() calls
> do_invalidatepage() calls
> iomap_invalidatepage() calls
> iomap_page_release()
> 
> Is this the first warning?  I'm wondering if maybe there was an I/O error
> earlier which caused PageUptodate to get cleared again.  If it's easy to
> reproduce, perhaps you could try something like this?
> 
> +void dump_iomap_page(struct page *page, const char *reason)
> +{
> +       struct iomap_page *iop = to_iomap_page(page);
> +       unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
> +
> +       dump_page(page, reason);
> +       if (iop)
> +               printk("iop:reads %d writes %d uptodate %*pb\n",
> +                               atomic_read(&iop->read_bytes_pending),
> +                               atomic_read(&iop->write_bytes_pending),
> +                               nr_blocks, iop->uptodate);
> +       else
> +               printk("iop:none\n");
> +}
> 
> and then do something like:
> 
> 	if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page))
> 		dump_iomap_page(page, NULL);

This:

[ 1683.158254][T164965] page:000000004a6c16cd refcount:2 mapcount:0 mapping:00000000ea017dc5 index:0x2 pfn:0xc365c
[ 1683.158311][T164965] aops:xfs_address_space_operations ino:417b7e7 dentry name:"trinity-testfile2"
[ 1683.158354][T164965] flags: 0x7fff8000000015(locked|uptodate|lru)
[ 1683.158392][T164965] raw: 007fff8000000015 c00c0000019c4b08 c00c0000019a53c8 c000201c8362c1e8
[ 1683.158430][T164965] raw: 0000000000000002 0000000000000000 00000002ffffffff c000201c54db4000
[ 1683.158470][T164965] page->mem_cgroup:c000201c54db4000
[ 1683.158506][T164965] iop:none
Matthew Wilcox Sept. 23, 2020, 2:48 a.m. UTC | #7
On Tue, Sep 22, 2020 at 09:06:03PM -0400, Qian Cai wrote:
> On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote:
> > On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote:
> > > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote:
> > > > Size the uptodate array dynamically to support larger pages in the
> > > > page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> > > > but with a 2MB maximum page size, we'd have to allocate more than 4kB
> > > > per page.  Add a few debugging assertions.
> > > > 
> > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Some syscall fuzzing will trigger this on powerpc:
> > > 
> > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config
> > > 
> > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-
> > > io.c:78 iomap_page_release+0x250/0x270
> > 
> > Well, I'm glad it triggered.  That warning is:
> >         WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
> >                         PageUptodate(page));
> > so there was definitely a problem of some kind.
> > 
> > truncate_cleanup_page() calls
> > do_invalidatepage() calls
> > iomap_invalidatepage() calls
> > iomap_page_release()
> > 
> > Is this the first warning?  I'm wondering if maybe there was an I/O error
> > earlier which caused PageUptodate to get cleared again.  If it's easy to
> > reproduce, perhaps you could try something like this?
> > 
> > +void dump_iomap_page(struct page *page, const char *reason)
> > +{
> > +       struct iomap_page *iop = to_iomap_page(page);
> > +       unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
> > +
> > +       dump_page(page, reason);
> > +       if (iop)
> > +               printk("iop:reads %d writes %d uptodate %*pb\n",
> > +                               atomic_read(&iop->read_bytes_pending),
> > +                               atomic_read(&iop->write_bytes_pending),
> > +                               nr_blocks, iop->uptodate);
> > +       else
> > +               printk("iop:none\n");
> > +}
> > 
> > and then do something like:
> > 
> > 	if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page))
> > 		dump_iomap_page(page, NULL);
> 
> This:
> 
> [ 1683.158254][T164965] page:000000004a6c16cd refcount:2 mapcount:0 mapping:00000000ea017dc5 index:0x2 pfn:0xc365c
> [ 1683.158311][T164965] aops:xfs_address_space_operations ino:417b7e7 dentry name:"trinity-testfile2"
> [ 1683.158354][T164965] flags: 0x7fff8000000015(locked|uptodate|lru)
> [ 1683.158392][T164965] raw: 007fff8000000015 c00c0000019c4b08 c00c0000019a53c8 c000201c8362c1e8
> [ 1683.158430][T164965] raw: 0000000000000002 0000000000000000 00000002ffffffff c000201c54db4000
> [ 1683.158470][T164965] page->mem_cgroup:c000201c54db4000
> [ 1683.158506][T164965] iop:none

Oh, I'm a fool.  This is after the call to detach_page_private() so
page->private is NULL and we don't get the iop dumped.

Nevertheless, this is interesting.  Somehow, the page is marked Uptodate,
but the bitmap is deemed not full.  There are three places where we set
an iomap page Uptodate:

1.      if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page)))
                SetPageUptodate(page);

2.      if (page_has_private(page))
                iomap_iop_set_range_uptodate(page, off, len);
        else
                SetPageUptodate(page);

3.      BUG_ON(page->index);
...
        SetPageUptodate(page);

It can't be #2 because the page has an iop.  It can't be #3 because the
page->index is not 0.  So at some point in the past, the bitmap was full.

I don't think it's possible for inode->i_blksize to change, and you
aren't running with THPs, so it's definitely not possible for thp_size()
to change.  So i_blocks_per_page() isn't going to change.

We seem to have allocated enough memory for ->iop because that's also
based on i_blocks_per_page().

I'm out of ideas.  Maybe I'll wake up with a better idea in the morning.
I've been trying to reproduce this on x86 with a 1kB block size
filesystem, and haven't been able to yet.  Maybe I'll try to setup a
powerpc cross-compilation environment tomorrow.
Darrick J. Wong Sept. 23, 2020, 5 a.m. UTC | #8
On Wed, Sep 23, 2020 at 03:48:59AM +0100, Matthew Wilcox wrote:
> On Tue, Sep 22, 2020 at 09:06:03PM -0400, Qian Cai wrote:
> > On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote:
> > > On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote:
> > > > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote:
> > > > > Size the uptodate array dynamically to support larger pages in the
> > > > > page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> > > > > but with a 2MB maximum page size, we'd have to allocate more than 4kB
> > > > > per page.  Add a few debugging assertions.
> > > > > 
> > > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > > > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > > > 
> > > > Some syscall fuzzing will trigger this on powerpc:
> > > > 
> > > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config
> > > > 
> > > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-
> > > > io.c:78 iomap_page_release+0x250/0x270
> > > 
> > > Well, I'm glad it triggered.  That warning is:
> > >         WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
> > >                         PageUptodate(page));
> > > so there was definitely a problem of some kind.
> > > 
> > > truncate_cleanup_page() calls
> > > do_invalidatepage() calls
> > > iomap_invalidatepage() calls
> > > iomap_page_release()
> > > 
> > > Is this the first warning?  I'm wondering if maybe there was an I/O error
> > > earlier which caused PageUptodate to get cleared again.  If it's easy to
> > > reproduce, perhaps you could try something like this?
> > > 
> > > +void dump_iomap_page(struct page *page, const char *reason)
> > > +{
> > > +       struct iomap_page *iop = to_iomap_page(page);
> > > +       unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
> > > +
> > > +       dump_page(page, reason);
> > > +       if (iop)
> > > +               printk("iop:reads %d writes %d uptodate %*pb\n",
> > > +                               atomic_read(&iop->read_bytes_pending),
> > > +                               atomic_read(&iop->write_bytes_pending),
> > > +                               nr_blocks, iop->uptodate);
> > > +       else
> > > +               printk("iop:none\n");
> > > +}
> > > 
> > > and then do something like:
> > > 
> > > 	if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page))
> > > 		dump_iomap_page(page, NULL);
> > 
> > This:
> > 
> > [ 1683.158254][T164965] page:000000004a6c16cd refcount:2 mapcount:0 mapping:00000000ea017dc5 index:0x2 pfn:0xc365c
> > [ 1683.158311][T164965] aops:xfs_address_space_operations ino:417b7e7 dentry name:"trinity-testfile2"
> > [ 1683.158354][T164965] flags: 0x7fff8000000015(locked|uptodate|lru)
> > [ 1683.158392][T164965] raw: 007fff8000000015 c00c0000019c4b08 c00c0000019a53c8 c000201c8362c1e8
> > [ 1683.158430][T164965] raw: 0000000000000002 0000000000000000 00000002ffffffff c000201c54db4000
> > [ 1683.158470][T164965] page->mem_cgroup:c000201c54db4000
> > [ 1683.158506][T164965] iop:none
> 
> Oh, I'm a fool.  This is after the call to detach_page_private() so
> page->private is NULL and we don't get the iop dumped.
> 
> Nevertheless, this is interesting.  Somehow, the page is marked Uptodate,
> but the bitmap is deemed not full.  There are three places where we set
> an iomap page Uptodate:
> 
> 1.      if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page)))
>                 SetPageUptodate(page);
> 
> 2.      if (page_has_private(page))
>                 iomap_iop_set_range_uptodate(page, off, len);
>         else
>                 SetPageUptodate(page);
> 
> 3.      BUG_ON(page->index);
> ...
>         SetPageUptodate(page);
> 
> It can't be #2 because the page has an iop.  It can't be #3 because the
> page->index is not 0.  So at some point in the past, the bitmap was full.
> 
> I don't think it's possible for inode->i_blksize to change, and you
> aren't running with THPs, so it's definitely not possible for thp_size()
> to change.  So i_blocks_per_page() isn't going to change.
> 
> We seem to have allocated enough memory for ->iop because that's also
> based on i_blocks_per_page().
> 
> I'm out of ideas.  Maybe I'll wake up with a better idea in the morning.
> I've been trying to reproduce this on x86 with a 1kB block size
> filesystem, and haven't been able to yet.  Maybe I'll try to setup a
> powerpc cross-compilation environment tomorrow.

FWIW I managed to reproduce it with the following fstests configuration
on a 1k block size fs on a x86 machinE:

SECTION      -- -no-sections-
FSTYP        -- xfs
MKFS_OPTIONS --  -m reflink=1,rmapbt=1 -i sparse=1 -b size=1024
MOUNT_OPTIONS --  -o usrquota,grpquota,prjquota
HOST_OPTIONS -- local.config
CHECK_OPTIONS -- -g auto
XFS_MKFS_OPTIONS -- -bsize=4096
TIME_FACTOR  -- 1
LOAD_FACTOR  -- 1
TEST_DIR     -- /mnt
TEST_DEV     -- /dev/sde
SCRATCH_DEV  -- /dev/sdd
SCRATCH_MNT  -- /opt
OVL_UPPER    -- ovl-upper
OVL_LOWER    -- ovl-lower
OVL_WORK     -- ovl-work
KERNEL       -- 5.9.0-rc4-djw

The kernel is more or less iomap-for-next.

--D
Qian Cai Sept. 23, 2020, 4:55 p.m. UTC | #9
On Wed, 2020-09-23 at 03:48 +0100, Matthew Wilcox wrote:
> I'm out of ideas.  Maybe I'll wake up with a better idea in the morning.
> I've been trying to reproduce this on x86 with a 1kB block size
> filesystem, and haven't been able to yet.  Maybe I'll try to setup a
> powerpc cross-compilation environment tomorrow.

Another data point is that this can be reproduced on both arm64 and powerpc
(both have 64k-size pages) by running this LTP test case:

https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/syscalls/preadv2/preadv203.c

[ 2397.723289] preadv203 (80525): drop_caches: 3
[ 2400.796402] XFS (loop0): Unmounting Filesystem
[ 2401.431293] ------------[ cut here ]------------
[ 2401.431411] WARNING: CPU: 0 PID: 80501 at fs/iomap/buffered-io.c:78 iomap_page_release+0x250/0x270
[ 2401.431435] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_spapr_tce vfio vfio_spapr_eeh loop kvm_hv kvm ip_tables x_tables sd_mod bnx2x mdio ahci libahci tg3 libphy libata firmware_class dm_mirror dm_reg
ion_hash dm_log dm_mod
[ 2401.431534] CPU: 0 PID: 80501 Comm: preadv203 Not tainted 5.9.0-rc6-next-20200923+ #4
[ 2401.431567] NIP:  c000000000473b30 LR: c0000000004739ec CTR: c000000000473e80
[ 2401.431590] REGS: c0000011a7bbf7a0 TRAP: 0700   Not tainted  (5.9.0-rc6-next-20200923+)
[ 2401.431613] MSR:  9000000000029033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 44000244  XER: 20040000
[ 2401.431655] CFAR: c000000000473a24 IRQMASK: 0 
               GPR00: c00000000029b6a8 c0000011a7bbfa30 c000000007e87e00 0000000000000005 
               GPR04: 0000000000000000 0000000000000005 0000000000000005 0000000000000000 
               GPR08: c00c000806f1f587 087fff8000000037 0000000000000001 c00000000875e028 
               GPR12: c000000000473e80 c00000000c180000 0000000000000000 0000000000000000 
               GPR16: 0000000000000000 c0000011a7bbfbb0 c0000011a7bbfbc0 0000000000000000 
               GPR20: c000000000a4aad8 0000000000000000 0000000000000000 000000000000000f 
               GPR24: ffffffffffffffff c0000011a7bbfb38 c0000011a7bbfac0 0000000000000000 
               GPR28: 0000000000000001 c000000ea4fe0a28 0000000000000000 c00c0008071b46c0 
[ 2401.431856] NIP [c000000000473b30] iomap_page_release+0x250/0x270
[ 2401.431878] LR [c0000000004739ec] iomap_page_release+0x10c/0x270
[ 2401.431899] Call Trace:
[ 2401.431926] [c0000011a7bbfa30] [c0000011a7bbfac0] 0xc0000011a7bbfac0 (unreliable)
[ 2401.431962] [c0000011a7bbfa70] [c00000000029b6a8] truncate_cleanup_page+0x188/0x2e0
[ 2401.431987] [c0000011a7bbfaa0] [c00000000029c62c] truncate_inode_pages_range+0x23c/0x9f0
[ 2401.432023] [c0000011a7bbfcc0] [c0000000003d9588] evict+0x1e8/0x200
[ 2401.432055] [c0000011a7bbfd00] [c0000000003c64b0] do_unlinkat+0x2d0/0x3a0
[ 2401.432081] [c0000011a7bbfdc0] [c00000000002a458] system_call_exception+0xf8/0x1d0
[ 2401.432097] [c0000011a7bbfe20] [c00000000000d0a8] system_call_common+0xe8/0x218
[ 2401.432120] Instruction dump:
[ 2401.432140] 3c82f8ba 388490b8 4be655b1 60000000 0fe00000 60000000 60000000 60000000 
[ 2401.432176] 0fe00000 4bfffea8 60000000 60000000 <0fe00000> 4bfffef4 60000000 60000000 
[ 2401.432213] CPU: 0 PID: 80501 Comm: preadv203 Not tainted 5.9.0-rc6-next-20200923+ #4
[ 2401.432245] Call Trace:
[ 2401.432255] [c0000011a7bbf590] [c000000000644778] dump_stack+0xec/0x144 (unreliable)
[ 2401.432284] [c0000011a7bbf5d0] [c0000000000b0a24] __warn+0xc4/0x144
[ 2401.432298] [c0000011a7bbf660] [c000000000643388] report_bug+0x108/0x1f0
[ 2401.432331] [c0000011a7bbf700] [c000000000021714] program_check_exception+0x104/0x2e0
[ 2401.432359] [c0000011a7bbf730] [c000000000009664] program_check_common_virt+0x2c4/0x310
[ 2401.432385] --- interrupt: 700 at iomap_page_release+0x250/0x270
                   LR = iomap_page_release+0x10c/0x270
[ 2401.432420] [c0000011a7bbfa30] [c0000011a7bbfac0] 0xc0000011a7bbfac0 (unreliable)
[ 2401.432446] [c0000011a7bbfa70] [c00000000029b6a8] truncate_cleanup_page+0x188/0x2e0
[ 2401.432470] [c0000011a7bbfaa0] [c00000000029c62c] truncate_inode_pages_range+0x23c/0x9f0
[ 2401.432505] [c0000011a7bbfcc0] [c0000000003d9588] evict+0x1e8/0x200
[ 2401.432528] [c0000011a7bbfd00] [c0000000003c64b0] do_unlinkat+0x2d0/0x3a0
[ 2401.432552] [c0000011a7bbfdc0] [c00000000002a458] system_call_exception+0xf8/0x1d0
[ 2401.432576] [c0000011a7bbfe20] [c00000000000d0a8] system_call_common+0xe8/0x218
[ 2401.432599] irq event stamp: 210662
[ 2401.432619] hardirqs last  enabled at (210661): [<c0000000008d32bc>] _raw_spin_unlock_irq+0x3c/0x70
[ 2401.432652] hardirqs last disabled at (210662): [<c00000000000965c>] program_check_common_virt+0x2bc/0x310
[ 2401.432688] softirqs last  enabled at (210280): [<c000000000547de4>] xfs_buf_find.isra.31+0xb54/0x1060
[ 2401.432722] softirqs last disabled at (210278): [<c0000000005476fc>] xfs_buf_find.isra.31+0x46c/0x1060
Matthew Wilcox Sept. 23, 2020, 6:59 p.m. UTC | #10
On Tue, Sep 22, 2020 at 10:00:01PM -0700, Darrick J. Wong wrote:
> On Wed, Sep 23, 2020 at 03:48:59AM +0100, Matthew Wilcox wrote:
> > On Tue, Sep 22, 2020 at 09:06:03PM -0400, Qian Cai wrote:
> > > On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote:
> > > > On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote:
> > > > > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote:
> > > > > > Size the uptodate array dynamically to support larger pages in the
> > > > > > page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> > > > > > but with a 2MB maximum page size, we'd have to allocate more than 4kB
> > > > > > per page.  Add a few debugging assertions.
> > > > > > 
> > > > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > > > > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > > > > 
> > > > > Some syscall fuzzing will trigger this on powerpc:
> > > > > 
> > > > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config
> > > > > 
> > > > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-
> > > > > io.c:78 iomap_page_release+0x250/0x270
> > > > 
> > > > Well, I'm glad it triggered.  That warning is:
> > > >         WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
> > > >                         PageUptodate(page));
> > > > so there was definitely a problem of some kind.
> > > > 
> > > > truncate_cleanup_page() calls
> > > > do_invalidatepage() calls
> > > > iomap_invalidatepage() calls
> > > > iomap_page_release()
> > > > 
> > > > Is this the first warning?  I'm wondering if maybe there was an I/O error
> > > > earlier which caused PageUptodate to get cleared again.  If it's easy to
> > > > reproduce, perhaps you could try something like this?
> > > > 
> > > > +void dump_iomap_page(struct page *page, const char *reason)
> > > > +{
> > > > +       struct iomap_page *iop = to_iomap_page(page);
> > > > +       unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
> > > > +
> > > > +       dump_page(page, reason);
> > > > +       if (iop)
> > > > +               printk("iop:reads %d writes %d uptodate %*pb\n",
> > > > +                               atomic_read(&iop->read_bytes_pending),
> > > > +                               atomic_read(&iop->write_bytes_pending),
> > > > +                               nr_blocks, iop->uptodate);
> > > > +       else
> > > > +               printk("iop:none\n");
> > > > +}
> > > > 
> > > > and then do something like:
> > > > 
> > > > 	if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page))
> > > > 		dump_iomap_page(page, NULL);
> > > 
> > > This:
> > > 
> > > [ 1683.158254][T164965] page:000000004a6c16cd refcount:2 mapcount:0 mapping:00000000ea017dc5 index:0x2 pfn:0xc365c
> > > [ 1683.158311][T164965] aops:xfs_address_space_operations ino:417b7e7 dentry name:"trinity-testfile2"
> > > [ 1683.158354][T164965] flags: 0x7fff8000000015(locked|uptodate|lru)
> > > [ 1683.158392][T164965] raw: 007fff8000000015 c00c0000019c4b08 c00c0000019a53c8 c000201c8362c1e8
> > > [ 1683.158430][T164965] raw: 0000000000000002 0000000000000000 00000002ffffffff c000201c54db4000
> > > [ 1683.158470][T164965] page->mem_cgroup:c000201c54db4000
> > > [ 1683.158506][T164965] iop:none
> > 
> > Oh, I'm a fool.  This is after the call to detach_page_private() so
> > page->private is NULL and we don't get the iop dumped.
> > 
> > Nevertheless, this is interesting.  Somehow, the page is marked Uptodate,
> > but the bitmap is deemed not full.  There are three places where we set
> > an iomap page Uptodate:
> > 
> > 1.      if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page)))
> >                 SetPageUptodate(page);
> > 
> > 2.      if (page_has_private(page))
> >                 iomap_iop_set_range_uptodate(page, off, len);
> >         else
> >                 SetPageUptodate(page);
> > 
> > 3.      BUG_ON(page->index);
> > ...
> >         SetPageUptodate(page);
> > 
> > It can't be #2 because the page has an iop.  It can't be #3 because the
> > page->index is not 0.  So at some point in the past, the bitmap was full.
> > 
> > I don't think it's possible for inode->i_blksize to change, and you
> > aren't running with THPs, so it's definitely not possible for thp_size()
> > to change.  So i_blocks_per_page() isn't going to change.
> > 
> > We seem to have allocated enough memory for ->iop because that's also
> > based on i_blocks_per_page().
> > 
> > I'm out of ideas.  Maybe I'll wake up with a better idea in the morning.
> > I've been trying to reproduce this on x86 with a 1kB block size
> > filesystem, and haven't been able to yet.  Maybe I'll try to setup a
> > powerpc cross-compilation environment tomorrow.
> 
> FWIW I managed to reproduce it with the following fstests configuration
> on a 1k block size fs on a x86 machinE:
> 
> SECTION      -- -no-sections-
> FSTYP        -- xfs
> MKFS_OPTIONS --  -m reflink=1,rmapbt=1 -i sparse=1 -b size=1024
> MOUNT_OPTIONS --  -o usrquota,grpquota,prjquota
> HOST_OPTIONS -- local.config
> CHECK_OPTIONS -- -g auto
> XFS_MKFS_OPTIONS -- -bsize=4096
> TIME_FACTOR  -- 1
> LOAD_FACTOR  -- 1
> TEST_DIR     -- /mnt
> TEST_DEV     -- /dev/sde
> SCRATCH_DEV  -- /dev/sdd
> SCRATCH_MNT  -- /opt
> OVL_UPPER    -- ovl-upper
> OVL_LOWER    -- ovl-lower
> OVL_WORK     -- ovl-work
> KERNEL       -- 5.9.0-rc4-djw

It just survived another 3-hour run for me:

FSTYP         -- xfs (debug)
PLATFORM      -- Linux/x86_64 bobo-kvm 5.9.0-rc4 #40 SMP Tue Sep 22 14:18:21 EDT 2020
MKFS_OPTIONS  -- -f -m reflink=1,rmapbt=1 -i sparse=1 -b size=1024 /dev/sdc
MOUNT_OPTIONS -- /dev/sdc /mnt/scratch

The only warning I hit was in generic/019:

0172 WARNING: CPU: 1 PID: 6933 at fs/iomap/buffered-io.c:997 iomap_page_mkwrite_actor+0x72/0x80

which is the:
                WARN_ON_ONCE(!PageUptodate(page));
that happens as a result of the ClearPageUptodate() in iomap_writepage_map()
which has been happening approximately forever.
Matthew Wilcox Sept. 24, 2020, 1:07 a.m. UTC | #11
On Tue, Sep 22, 2020 at 06:05:26PM +0100, Matthew Wilcox wrote:
> On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote:
> > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote:
> > > Size the uptodate array dynamically to support larger pages in the
> > > page cache.  With a 64kB page, we're only saving 8 bytes per page today,
> > > but with a 2MB maximum page size, we'd have to allocate more than 4kB
> > > per page.  Add a few debugging assertions.
> > > 
> > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > 
> > Some syscall fuzzing will trigger this on powerpc:
> > 
> > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config
> > 
> > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-io.c:78 iomap_page_release+0x250/0x270
> 
> Well, I'm glad it triggered.  That warning is:
>         WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
>                         PageUptodate(page));
> so there was definitely a problem of some kind.

OK, I'm pretty sure the problem predated this commit, and it's simply
that I added a warning (looking for something else) that caught this.

I have a tree completly gunked up with debugging code now to try to
understand the problem better, but my guess is that if you put this
warning into a previous version, you'd see the same problem occurring
(and it is a real problem, because we skip writeback of parts of the
page which are !uptodate).
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 7fc0e02d27b0..9670c096b83e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -22,18 +22,25 @@ 
 #include "../internal.h"
 
 /*
- * Structure allocated for each page when block size < PAGE_SIZE to track
- * sub-page uptodate status and I/O completions.
+ * Structure allocated for each page or THP when block size < page size
+ * to track sub-page uptodate status and I/O completions.
  */
 struct iomap_page {
 	atomic_t		read_count;
 	atomic_t		write_count;
 	spinlock_t		uptodate_lock;
-	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+	unsigned long		uptodate[];
 };
 
 static inline struct iomap_page *to_iomap_page(struct page *page)
 {
+	/*
+	 * per-block data is stored in the head page.  Callers should
+	 * not be dealing with tail pages (and if they are, they can
+	 * call thp_head() first.
+	 */
+	VM_BUG_ON_PGFLAGS(PageTail(page), page);
+
 	if (page_has_private(page))
 		return (struct iomap_page *)page_private(page);
 	return NULL;
@@ -45,11 +52,13 @@  static struct iomap_page *
 iomap_page_create(struct inode *inode, struct page *page)
 {
 	struct iomap_page *iop = to_iomap_page(page);
+	unsigned int nr_blocks = i_blocks_per_page(inode, page);
 
-	if (iop || i_blocks_per_page(inode, page) <= 1)
+	if (iop || nr_blocks <= 1)
 		return iop;
 
-	iop = kzalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
+	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
+			GFP_NOFS | __GFP_NOFAIL);
 	spin_lock_init(&iop->uptodate_lock);
 	attach_page_private(page, iop);
 	return iop;
@@ -59,11 +68,14 @@  static void
 iomap_page_release(struct page *page)
 {
 	struct iomap_page *iop = detach_page_private(page);
+	unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
 
 	if (!iop)
 		return;
 	WARN_ON_ONCE(atomic_read(&iop->read_count));
 	WARN_ON_ONCE(atomic_read(&iop->write_count));
+	WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
+			PageUptodate(page));
 	kfree(iop);
 }