Message ID | 20200910234707.5504-6-willy@infradead.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | THP iomap patches for 5.10 | expand |
On Fri, Sep 11, 2020 at 12:47:03AM +0100, Matthew Wilcox (Oracle) wrote: > Size the uptodate array dynamically to support larger pages in the > page cache. With a 64kB page, we're only saving 8 bytes per page today, > but with a 2MB maximum page size, we'd have to allocate more than 4kB > per page. Add a few debugging assertions. > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > Reviewed-by: Dave Chinner <dchinner@redhat.com> Looks good, Reviewed-by: Christoph Hellwig <hch@lst.de>
On Fri, Sep 11, 2020 at 12:47:03AM +0100, Matthew Wilcox (Oracle) wrote: > Size the uptodate array dynamically to support larger pages in the > page cache. With a 64kB page, we're only saving 8 bytes per page today, > but with a 2MB maximum page size, we'd have to allocate more than 4kB > per page. Add a few debugging assertions. > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > Reviewed-by: Dave Chinner <dchinner@redhat.com> Looks ok, Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> --D > --- > fs/iomap/buffered-io.c | 22 +++++++++++++++++----- > 1 file changed, 17 insertions(+), 5 deletions(-) > > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c > index 7fc0e02d27b0..9670c096b83e 100644 > --- a/fs/iomap/buffered-io.c > +++ b/fs/iomap/buffered-io.c > @@ -22,18 +22,25 @@ > #include "../internal.h" > > /* > - * Structure allocated for each page when block size < PAGE_SIZE to track > - * sub-page uptodate status and I/O completions. > + * Structure allocated for each page or THP when block size < page size > + * to track sub-page uptodate status and I/O completions. > */ > struct iomap_page { > atomic_t read_count; > atomic_t write_count; > spinlock_t uptodate_lock; > - DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); > + unsigned long uptodate[]; > }; > > static inline struct iomap_page *to_iomap_page(struct page *page) > { > + /* > + * per-block data is stored in the head page. Callers should > + * not be dealing with tail pages (and if they are, they can > + * call thp_head() first. > + */ > + VM_BUG_ON_PGFLAGS(PageTail(page), page); > + > if (page_has_private(page)) > return (struct iomap_page *)page_private(page); > return NULL; > @@ -45,11 +52,13 @@ static struct iomap_page * > iomap_page_create(struct inode *inode, struct page *page) > { > struct iomap_page *iop = to_iomap_page(page); > + unsigned int nr_blocks = i_blocks_per_page(inode, page); > > - if (iop || i_blocks_per_page(inode, page) <= 1) > + if (iop || nr_blocks <= 1) > return iop; > > - iop = kzalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); > + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), > + GFP_NOFS | __GFP_NOFAIL); > spin_lock_init(&iop->uptodate_lock); > attach_page_private(page, iop); > return iop; > @@ -59,11 +68,14 @@ static void > iomap_page_release(struct page *page) > { > struct iomap_page *iop = detach_page_private(page); > + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); > > if (!iop) > return; > WARN_ON_ONCE(atomic_read(&iop->read_count)); > WARN_ON_ONCE(atomic_read(&iop->write_count)); > + WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != > + PageUptodate(page)); > kfree(iop); > } > > -- > 2.28.0 >
On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote: > Size the uptodate array dynamically to support larger pages in the > page cache. With a 64kB page, we're only saving 8 bytes per page today, > but with a 2MB maximum page size, we'd have to allocate more than 4kB > per page. Add a few debugging assertions. > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > Reviewed-by: Dave Chinner <dchinner@redhat.com> Some syscall fuzzing will trigger this on powerpc: .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-io.c:78 iomap_page_release+0x250/0x270 [ 8805.895376][T445431] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_spapr_tce vfio vfio_spapr_eeh loop kvm_hv kvm ip_tables x_tables sd_mod bnx2x tg3 ahci libahci mdio libphy firmware_class libata dm_mirror dm_region_hash dm_log dm_mod [ 8805.895521][T445431] CPU: 61 PID: 445431 Comm: trinity-c61 Not tainted 5.9.0-rc6-next-20200922 #3 [ 8805.895551][T445431] NIP: c0000000004734a0 LR: c00000000047335c CTR: 0000000000000000 [ 8805.895571][T445431] REGS: c000001fe5427620 TRAP: 0700 Not tainted (5.9.0-rc6-next-20200922) [ 8805.895609][T445431] MSR: 9000000000029033 <SF,HV,EE,ME,IR,DR,RI,LE> CR: 44002244 XER: 20040000 [ 8805.895671][T445431] CFAR: c000000000473394 IRQMASK: 0 [ 8805.895671][T445431] GPR00: c00000000029b698 c000001fe54278b0 c000000005687e00 0000000000000000 [ 8805.895671][T445431] GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 8805.895671][T445431] GPR08: c00c000002c4b5c7 007fff8000000015 0000000000000001 c000000005f5e028 [ 8805.895671][T445431] GPR12: 0000000000002000 c000001ffffc0580 0000000010037be0 00000000109ec8e0 [ 8805.895671][T445431] GPR16: 00000000109eccb0 c000001fe54279c0 c000001fe5427a40 0000000000000000 [ 8805.895671][T445431] GPR20: c000000000a4aa80 0000000000000008 0000000000000000 000000000000000f [ 8805.895671][T445431] GPR24: ffffffffffffffff c000001fe54279b8 c000001fe5427940 0000000000000001 [ 8805.895671][T445431] GPR28: 0000000000000001 c0000014dd4f9128 0000000000000000 c00c000006273a80 [ 8805.895989][T445431] NIP [c0000000004734a0] iomap_page_release+0x250/0x270 [ 8805.896016][T445431] LR [c00000000047335c] iomap_page_release+0x10c/0x270 [ 8805.896041][T445431] Call Trace: [ 8805.896065][T445431] [c000001fe54278b0] [c000001fe5427940] 0xc000001fe5427940 (unreliable) [ 8805.896101][T445431] [c000001fe54278f0] [c00000000029b698] truncate_cleanup_page+0x188/0x2e0 [ 8805.896144][T445431] [c000001fe5427920] [c00000000029c61c] truncate_inode_pages_range+0x23c/0x9f0 [ 8805.896187][T445431] [c000001fe5427b40] [c00000000029ceec] truncate_pagecache+0x5c/0x90 [ 8805.896229][T445431] [c000001fe5427b80] [c000000000569858] xfs_setattr_size+0xb8/0x5d0 [ 8805.896270][T445431] [c000001fe5427c10] [c00000000056a26c] xfs_vn_setattr+0x8c/0x130 [ 8805.896321][T445431] [c000001fe5427c60] [c0000000003dbe60] notify_change+0x390/0x5b0 [ 8805.896372][T445431] [c000001fe5427cd0] [c0000000003a5294] do_truncate+0x94/0x130 [ 8805.896412][T445431] [c000001fe5427d60] [c0000000003a57a8] do_sys_ftruncate+0xe8/0x160 [ 8805.896455][T445431] [c000001fe5427dc0] [c00000000002a458] system_call_exception+0xf8/0x1d0 [ 8805.896496][T445431] [c000001fe5427e20] [c00000000000d0a8] system_call_common+0xe8/0x218 [ 8805.896544][T445431] Instruction dump: [ 8805.896569][T445431] 3c82fb3a 388490c8 4be65be1 60000000 0fe00000 60000000 60000000 60000000 [ 8805.896623][T445431] 0fe00000 4bfffea8 60000000 60000000 <0fe00000> 4bfffef4 60000000 60000000 [ 8805.896678][T445431] CPU: 61 PID: 445431 Comm: trinity-c61 Not tainted 5.9.0-rc6-next-20200922 #3 [ 8805.896716][T445431] Call Trace: [ 8805.896750][T445431] [c000001fe5427410] [c0000000006440e8] dump_stack+0xec/0x144 (unreliable) [ 8805.896796][T445431] [c000001fe5427450] [c0000000000b0a24] __warn+0xc4/0x144 [ 8805.896837][T445431] [c000001fe54274e0] [c000000000642cf8] report_bug+0x108/0x1f0 [ 8805.896878][T445431] [c000001fe5427580] [c000000000021714] program_check_exception+0x104/0x2e0 [ 8805.896922][T445431] [c000001fe54275b0] [c000000000009664] program_check_common_virt+0x2c4/0x310 [ 8805.896965][T445431] --- interrupt: 700 at iomap_page_release+0x250/0x270 [ 8805.896965][T445431] LR = iomap_page_release+0x10c/0x270 [ 8805.897018][T445431] [c000001fe54278b0] [c000001fe5427940] 0xc000001fe5427940 (unreliable) [ 8805.897053][T445431] [c000001fe54278f0] [c00000000029b698] truncate_cleanup_page+0x188/0x2e0 [ 8805.897104][T445431] [c000001fe5427920] [c00000000029c61c] truncate_inode_pages_range+0x23c/0x9f0 [ 8805.897146][T445431] [c000001fe5427b40] [c00000000029ceec] truncate_pagecache+0x5c/0x90 [ 8805.897186][T445431] [c000001fe5427b80] [c000000000569858] xfs_setattr_size+0xb8/0x5d0 [ 8805.897226][T445431] [c000001fe5427c10] [c00000000056a26c] xfs_vn_setattr+0x8c/0x130 [ 8805.897266][T445431] [c000001fe5427c60] [c0000000003dbe60] notify_change+0x390/0x5b0 [ 8805.897307][T445431] [c000001fe5427cd0] [c0000000003a5294] do_truncate+0x94/0x130 [ 8805.897329][T445431] [c000001fe5427d60] [c0000000003a57a8] do_sys_ftruncate+0xe8/0x160 [ 8805.897369][T445431] [c000001fe5427dc0] [c00000000002a458] system_call_exception+0xf8/0x1d0 [ 8805.897410][T445431] [c000001fe5427e20] [c00000000000d0a8] system_call_common+0xe8/0x218 [ 8805.897448][T445431] irq event stamp: 360936 [ 8805.897475][T445431] hardirqs last enabled at (360935): [<c0000000002bd5b8>] dec_zone_page_state+0x78/0xb0 [ 8805.897515][T445431] hardirqs last disabled at (360936): [<c00000000000965c>] program_check_common_virt+0x2bc/0x310 [ 8805.897556][T445431] softirqs last enabled at (342438): [<c0000000008d3d38>] __do_softirq+0x378/0x6e4 [ 8805.897597][T445431] softirqs last disabled at (342433): [<c0000000000bc838>] irq_exit+0x118/0x140 > --- > fs/iomap/buffered-io.c | 22 +++++++++++++++++----- > 1 file changed, 17 insertions(+), 5 deletions(-) > > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c > index 7fc0e02d27b0..9670c096b83e 100644 > --- a/fs/iomap/buffered-io.c > +++ b/fs/iomap/buffered-io.c > @@ -22,18 +22,25 @@ > #include "../internal.h" > > /* > - * Structure allocated for each page when block size < PAGE_SIZE to track > - * sub-page uptodate status and I/O completions. > + * Structure allocated for each page or THP when block size < page size > + * to track sub-page uptodate status and I/O completions. > */ > struct iomap_page { > atomic_t read_count; > atomic_t write_count; > spinlock_t uptodate_lock; > - DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); > + unsigned long uptodate[]; > }; > > static inline struct iomap_page *to_iomap_page(struct page *page) > { > + /* > + * per-block data is stored in the head page. Callers should > + * not be dealing with tail pages (and if they are, they can > + * call thp_head() first. > + */ > + VM_BUG_ON_PGFLAGS(PageTail(page), page); > + > if (page_has_private(page)) > return (struct iomap_page *)page_private(page); > return NULL; > @@ -45,11 +52,13 @@ static struct iomap_page * > iomap_page_create(struct inode *inode, struct page *page) > { > struct iomap_page *iop = to_iomap_page(page); > + unsigned int nr_blocks = i_blocks_per_page(inode, page); > > - if (iop || i_blocks_per_page(inode, page) <= 1) > + if (iop || nr_blocks <= 1) > return iop; > > - iop = kzalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); > + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), > + GFP_NOFS | __GFP_NOFAIL); > spin_lock_init(&iop->uptodate_lock); > attach_page_private(page, iop); > return iop; > @@ -59,11 +68,14 @@ static void > iomap_page_release(struct page *page) > { > struct iomap_page *iop = detach_page_private(page); > + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); > > if (!iop) > return; > WARN_ON_ONCE(atomic_read(&iop->read_count)); > WARN_ON_ONCE(atomic_read(&iop->write_count)); > + WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != > + PageUptodate(page)); > kfree(iop); > } >
On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote: > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote: > > Size the uptodate array dynamically to support larger pages in the > > page cache. With a 64kB page, we're only saving 8 bytes per page today, > > but with a 2MB maximum page size, we'd have to allocate more than 4kB > > per page. Add a few debugging assertions. > > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > > Reviewed-by: Dave Chinner <dchinner@redhat.com> > > Some syscall fuzzing will trigger this on powerpc: > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-io.c:78 iomap_page_release+0x250/0x270 Well, I'm glad it triggered. That warning is: WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page)); so there was definitely a problem of some kind. truncate_cleanup_page() calls do_invalidatepage() calls iomap_invalidatepage() calls iomap_page_release() Is this the first warning? I'm wondering if maybe there was an I/O error earlier which caused PageUptodate to get cleared again. If it's easy to reproduce, perhaps you could try something like this? +void dump_iomap_page(struct page *page, const char *reason) +{ + struct iomap_page *iop = to_iomap_page(page); + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); + + dump_page(page, reason); + if (iop) + printk("iop:reads %d writes %d uptodate %*pb\n", + atomic_read(&iop->read_bytes_pending), + atomic_read(&iop->write_bytes_pending), + nr_blocks, iop->uptodate); + else + printk("iop:none\n"); +} and then do something like: if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page)) dump_iomap_page(page, NULL);
On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote: > On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote: > > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote: > > > Size the uptodate array dynamically to support larger pages in the > > > page cache. With a 64kB page, we're only saving 8 bytes per page today, > > > but with a 2MB maximum page size, we'd have to allocate more than 4kB > > > per page. Add a few debugging assertions. > > > > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > > > Reviewed-by: Dave Chinner <dchinner@redhat.com> > > > > Some syscall fuzzing will trigger this on powerpc: > > > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config > > > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered- > > io.c:78 iomap_page_release+0x250/0x270 > > Well, I'm glad it triggered. That warning is: > WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != > PageUptodate(page)); > so there was definitely a problem of some kind. > > truncate_cleanup_page() calls > do_invalidatepage() calls > iomap_invalidatepage() calls > iomap_page_release() > > Is this the first warning? I'm wondering if maybe there was an I/O error > earlier which caused PageUptodate to get cleared again. If it's easy to > reproduce, perhaps you could try something like this? Yes, this is the first warning. BTW, I did run the reproducer of a805c111650c ("iomap: fix WARN_ON_ONCE() from unprivileged users") earlier, so I am wondering if this is just another victim WARN_ON_ONCE() from it. > > +void dump_iomap_page(struct page *page, const char *reason) > +{ > + struct iomap_page *iop = to_iomap_page(page); > + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); > + > + dump_page(page, reason); > + if (iop) > + printk("iop:reads %d writes %d uptodate %*pb\n", > + atomic_read(&iop->read_bytes_pending), > + atomic_read(&iop->write_bytes_pending), > + nr_blocks, iop->uptodate); > + else > + printk("iop:none\n"); > +} > > and then do something like: > > if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page)) > dump_iomap_page(page, NULL); >
On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote: > On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote: > > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote: > > > Size the uptodate array dynamically to support larger pages in the > > > page cache. With a 64kB page, we're only saving 8 bytes per page today, > > > but with a 2MB maximum page size, we'd have to allocate more than 4kB > > > per page. Add a few debugging assertions. > > > > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > > > Reviewed-by: Dave Chinner <dchinner@redhat.com> > > > > Some syscall fuzzing will trigger this on powerpc: > > > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config > > > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered- > > io.c:78 iomap_page_release+0x250/0x270 > > Well, I'm glad it triggered. That warning is: > WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != > PageUptodate(page)); > so there was definitely a problem of some kind. > > truncate_cleanup_page() calls > do_invalidatepage() calls > iomap_invalidatepage() calls > iomap_page_release() > > Is this the first warning? I'm wondering if maybe there was an I/O error > earlier which caused PageUptodate to get cleared again. If it's easy to > reproduce, perhaps you could try something like this? > > +void dump_iomap_page(struct page *page, const char *reason) > +{ > + struct iomap_page *iop = to_iomap_page(page); > + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); > + > + dump_page(page, reason); > + if (iop) > + printk("iop:reads %d writes %d uptodate %*pb\n", > + atomic_read(&iop->read_bytes_pending), > + atomic_read(&iop->write_bytes_pending), > + nr_blocks, iop->uptodate); > + else > + printk("iop:none\n"); > +} > > and then do something like: > > if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page)) > dump_iomap_page(page, NULL); This: [ 1683.158254][T164965] page:000000004a6c16cd refcount:2 mapcount:0 mapping:00000000ea017dc5 index:0x2 pfn:0xc365c [ 1683.158311][T164965] aops:xfs_address_space_operations ino:417b7e7 dentry name:"trinity-testfile2" [ 1683.158354][T164965] flags: 0x7fff8000000015(locked|uptodate|lru) [ 1683.158392][T164965] raw: 007fff8000000015 c00c0000019c4b08 c00c0000019a53c8 c000201c8362c1e8 [ 1683.158430][T164965] raw: 0000000000000002 0000000000000000 00000002ffffffff c000201c54db4000 [ 1683.158470][T164965] page->mem_cgroup:c000201c54db4000 [ 1683.158506][T164965] iop:none
On Tue, Sep 22, 2020 at 09:06:03PM -0400, Qian Cai wrote: > On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote: > > On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote: > > > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote: > > > > Size the uptodate array dynamically to support larger pages in the > > > > page cache. With a 64kB page, we're only saving 8 bytes per page today, > > > > but with a 2MB maximum page size, we'd have to allocate more than 4kB > > > > per page. Add a few debugging assertions. > > > > > > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > > > > Reviewed-by: Dave Chinner <dchinner@redhat.com> > > > > > > Some syscall fuzzing will trigger this on powerpc: > > > > > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config > > > > > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered- > > > io.c:78 iomap_page_release+0x250/0x270 > > > > Well, I'm glad it triggered. That warning is: > > WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != > > PageUptodate(page)); > > so there was definitely a problem of some kind. > > > > truncate_cleanup_page() calls > > do_invalidatepage() calls > > iomap_invalidatepage() calls > > iomap_page_release() > > > > Is this the first warning? I'm wondering if maybe there was an I/O error > > earlier which caused PageUptodate to get cleared again. If it's easy to > > reproduce, perhaps you could try something like this? > > > > +void dump_iomap_page(struct page *page, const char *reason) > > +{ > > + struct iomap_page *iop = to_iomap_page(page); > > + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); > > + > > + dump_page(page, reason); > > + if (iop) > > + printk("iop:reads %d writes %d uptodate %*pb\n", > > + atomic_read(&iop->read_bytes_pending), > > + atomic_read(&iop->write_bytes_pending), > > + nr_blocks, iop->uptodate); > > + else > > + printk("iop:none\n"); > > +} > > > > and then do something like: > > > > if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page)) > > dump_iomap_page(page, NULL); > > This: > > [ 1683.158254][T164965] page:000000004a6c16cd refcount:2 mapcount:0 mapping:00000000ea017dc5 index:0x2 pfn:0xc365c > [ 1683.158311][T164965] aops:xfs_address_space_operations ino:417b7e7 dentry name:"trinity-testfile2" > [ 1683.158354][T164965] flags: 0x7fff8000000015(locked|uptodate|lru) > [ 1683.158392][T164965] raw: 007fff8000000015 c00c0000019c4b08 c00c0000019a53c8 c000201c8362c1e8 > [ 1683.158430][T164965] raw: 0000000000000002 0000000000000000 00000002ffffffff c000201c54db4000 > [ 1683.158470][T164965] page->mem_cgroup:c000201c54db4000 > [ 1683.158506][T164965] iop:none Oh, I'm a fool. This is after the call to detach_page_private() so page->private is NULL and we don't get the iop dumped. Nevertheless, this is interesting. Somehow, the page is marked Uptodate, but the bitmap is deemed not full. There are three places where we set an iomap page Uptodate: 1. if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) SetPageUptodate(page); 2. if (page_has_private(page)) iomap_iop_set_range_uptodate(page, off, len); else SetPageUptodate(page); 3. BUG_ON(page->index); ... SetPageUptodate(page); It can't be #2 because the page has an iop. It can't be #3 because the page->index is not 0. So at some point in the past, the bitmap was full. I don't think it's possible for inode->i_blksize to change, and you aren't running with THPs, so it's definitely not possible for thp_size() to change. So i_blocks_per_page() isn't going to change. We seem to have allocated enough memory for ->iop because that's also based on i_blocks_per_page(). I'm out of ideas. Maybe I'll wake up with a better idea in the morning. I've been trying to reproduce this on x86 with a 1kB block size filesystem, and haven't been able to yet. Maybe I'll try to setup a powerpc cross-compilation environment tomorrow.
On Wed, Sep 23, 2020 at 03:48:59AM +0100, Matthew Wilcox wrote: > On Tue, Sep 22, 2020 at 09:06:03PM -0400, Qian Cai wrote: > > On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote: > > > On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote: > > > > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote: > > > > > Size the uptodate array dynamically to support larger pages in the > > > > > page cache. With a 64kB page, we're only saving 8 bytes per page today, > > > > > but with a 2MB maximum page size, we'd have to allocate more than 4kB > > > > > per page. Add a few debugging assertions. > > > > > > > > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > > > > > Reviewed-by: Dave Chinner <dchinner@redhat.com> > > > > > > > > Some syscall fuzzing will trigger this on powerpc: > > > > > > > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config > > > > > > > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered- > > > > io.c:78 iomap_page_release+0x250/0x270 > > > > > > Well, I'm glad it triggered. That warning is: > > > WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != > > > PageUptodate(page)); > > > so there was definitely a problem of some kind. > > > > > > truncate_cleanup_page() calls > > > do_invalidatepage() calls > > > iomap_invalidatepage() calls > > > iomap_page_release() > > > > > > Is this the first warning? I'm wondering if maybe there was an I/O error > > > earlier which caused PageUptodate to get cleared again. If it's easy to > > > reproduce, perhaps you could try something like this? > > > > > > +void dump_iomap_page(struct page *page, const char *reason) > > > +{ > > > + struct iomap_page *iop = to_iomap_page(page); > > > + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); > > > + > > > + dump_page(page, reason); > > > + if (iop) > > > + printk("iop:reads %d writes %d uptodate %*pb\n", > > > + atomic_read(&iop->read_bytes_pending), > > > + atomic_read(&iop->write_bytes_pending), > > > + nr_blocks, iop->uptodate); > > > + else > > > + printk("iop:none\n"); > > > +} > > > > > > and then do something like: > > > > > > if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page)) > > > dump_iomap_page(page, NULL); > > > > This: > > > > [ 1683.158254][T164965] page:000000004a6c16cd refcount:2 mapcount:0 mapping:00000000ea017dc5 index:0x2 pfn:0xc365c > > [ 1683.158311][T164965] aops:xfs_address_space_operations ino:417b7e7 dentry name:"trinity-testfile2" > > [ 1683.158354][T164965] flags: 0x7fff8000000015(locked|uptodate|lru) > > [ 1683.158392][T164965] raw: 007fff8000000015 c00c0000019c4b08 c00c0000019a53c8 c000201c8362c1e8 > > [ 1683.158430][T164965] raw: 0000000000000002 0000000000000000 00000002ffffffff c000201c54db4000 > > [ 1683.158470][T164965] page->mem_cgroup:c000201c54db4000 > > [ 1683.158506][T164965] iop:none > > Oh, I'm a fool. This is after the call to detach_page_private() so > page->private is NULL and we don't get the iop dumped. > > Nevertheless, this is interesting. Somehow, the page is marked Uptodate, > but the bitmap is deemed not full. There are three places where we set > an iomap page Uptodate: > > 1. if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) > SetPageUptodate(page); > > 2. if (page_has_private(page)) > iomap_iop_set_range_uptodate(page, off, len); > else > SetPageUptodate(page); > > 3. BUG_ON(page->index); > ... > SetPageUptodate(page); > > It can't be #2 because the page has an iop. It can't be #3 because the > page->index is not 0. So at some point in the past, the bitmap was full. > > I don't think it's possible for inode->i_blksize to change, and you > aren't running with THPs, so it's definitely not possible for thp_size() > to change. So i_blocks_per_page() isn't going to change. > > We seem to have allocated enough memory for ->iop because that's also > based on i_blocks_per_page(). > > I'm out of ideas. Maybe I'll wake up with a better idea in the morning. > I've been trying to reproduce this on x86 with a 1kB block size > filesystem, and haven't been able to yet. Maybe I'll try to setup a > powerpc cross-compilation environment tomorrow. FWIW I managed to reproduce it with the following fstests configuration on a 1k block size fs on a x86 machinE: SECTION -- -no-sections- FSTYP -- xfs MKFS_OPTIONS -- -m reflink=1,rmapbt=1 -i sparse=1 -b size=1024 MOUNT_OPTIONS -- -o usrquota,grpquota,prjquota HOST_OPTIONS -- local.config CHECK_OPTIONS -- -g auto XFS_MKFS_OPTIONS -- -bsize=4096 TIME_FACTOR -- 1 LOAD_FACTOR -- 1 TEST_DIR -- /mnt TEST_DEV -- /dev/sde SCRATCH_DEV -- /dev/sdd SCRATCH_MNT -- /opt OVL_UPPER -- ovl-upper OVL_LOWER -- ovl-lower OVL_WORK -- ovl-work KERNEL -- 5.9.0-rc4-djw The kernel is more or less iomap-for-next. --D
On Wed, 2020-09-23 at 03:48 +0100, Matthew Wilcox wrote: > I'm out of ideas. Maybe I'll wake up with a better idea in the morning. > I've been trying to reproduce this on x86 with a 1kB block size > filesystem, and haven't been able to yet. Maybe I'll try to setup a > powerpc cross-compilation environment tomorrow. Another data point is that this can be reproduced on both arm64 and powerpc (both have 64k-size pages) by running this LTP test case: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/syscalls/preadv2/preadv203.c [ 2397.723289] preadv203 (80525): drop_caches: 3 [ 2400.796402] XFS (loop0): Unmounting Filesystem [ 2401.431293] ------------[ cut here ]------------ [ 2401.431411] WARNING: CPU: 0 PID: 80501 at fs/iomap/buffered-io.c:78 iomap_page_release+0x250/0x270 [ 2401.431435] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_spapr_tce vfio vfio_spapr_eeh loop kvm_hv kvm ip_tables x_tables sd_mod bnx2x mdio ahci libahci tg3 libphy libata firmware_class dm_mirror dm_reg ion_hash dm_log dm_mod [ 2401.431534] CPU: 0 PID: 80501 Comm: preadv203 Not tainted 5.9.0-rc6-next-20200923+ #4 [ 2401.431567] NIP: c000000000473b30 LR: c0000000004739ec CTR: c000000000473e80 [ 2401.431590] REGS: c0000011a7bbf7a0 TRAP: 0700 Not tainted (5.9.0-rc6-next-20200923+) [ 2401.431613] MSR: 9000000000029033 <SF,HV,EE,ME,IR,DR,RI,LE> CR: 44000244 XER: 20040000 [ 2401.431655] CFAR: c000000000473a24 IRQMASK: 0 GPR00: c00000000029b6a8 c0000011a7bbfa30 c000000007e87e00 0000000000000005 GPR04: 0000000000000000 0000000000000005 0000000000000005 0000000000000000 GPR08: c00c000806f1f587 087fff8000000037 0000000000000001 c00000000875e028 GPR12: c000000000473e80 c00000000c180000 0000000000000000 0000000000000000 GPR16: 0000000000000000 c0000011a7bbfbb0 c0000011a7bbfbc0 0000000000000000 GPR20: c000000000a4aad8 0000000000000000 0000000000000000 000000000000000f GPR24: ffffffffffffffff c0000011a7bbfb38 c0000011a7bbfac0 0000000000000000 GPR28: 0000000000000001 c000000ea4fe0a28 0000000000000000 c00c0008071b46c0 [ 2401.431856] NIP [c000000000473b30] iomap_page_release+0x250/0x270 [ 2401.431878] LR [c0000000004739ec] iomap_page_release+0x10c/0x270 [ 2401.431899] Call Trace: [ 2401.431926] [c0000011a7bbfa30] [c0000011a7bbfac0] 0xc0000011a7bbfac0 (unreliable) [ 2401.431962] [c0000011a7bbfa70] [c00000000029b6a8] truncate_cleanup_page+0x188/0x2e0 [ 2401.431987] [c0000011a7bbfaa0] [c00000000029c62c] truncate_inode_pages_range+0x23c/0x9f0 [ 2401.432023] [c0000011a7bbfcc0] [c0000000003d9588] evict+0x1e8/0x200 [ 2401.432055] [c0000011a7bbfd00] [c0000000003c64b0] do_unlinkat+0x2d0/0x3a0 [ 2401.432081] [c0000011a7bbfdc0] [c00000000002a458] system_call_exception+0xf8/0x1d0 [ 2401.432097] [c0000011a7bbfe20] [c00000000000d0a8] system_call_common+0xe8/0x218 [ 2401.432120] Instruction dump: [ 2401.432140] 3c82f8ba 388490b8 4be655b1 60000000 0fe00000 60000000 60000000 60000000 [ 2401.432176] 0fe00000 4bfffea8 60000000 60000000 <0fe00000> 4bfffef4 60000000 60000000 [ 2401.432213] CPU: 0 PID: 80501 Comm: preadv203 Not tainted 5.9.0-rc6-next-20200923+ #4 [ 2401.432245] Call Trace: [ 2401.432255] [c0000011a7bbf590] [c000000000644778] dump_stack+0xec/0x144 (unreliable) [ 2401.432284] [c0000011a7bbf5d0] [c0000000000b0a24] __warn+0xc4/0x144 [ 2401.432298] [c0000011a7bbf660] [c000000000643388] report_bug+0x108/0x1f0 [ 2401.432331] [c0000011a7bbf700] [c000000000021714] program_check_exception+0x104/0x2e0 [ 2401.432359] [c0000011a7bbf730] [c000000000009664] program_check_common_virt+0x2c4/0x310 [ 2401.432385] --- interrupt: 700 at iomap_page_release+0x250/0x270 LR = iomap_page_release+0x10c/0x270 [ 2401.432420] [c0000011a7bbfa30] [c0000011a7bbfac0] 0xc0000011a7bbfac0 (unreliable) [ 2401.432446] [c0000011a7bbfa70] [c00000000029b6a8] truncate_cleanup_page+0x188/0x2e0 [ 2401.432470] [c0000011a7bbfaa0] [c00000000029c62c] truncate_inode_pages_range+0x23c/0x9f0 [ 2401.432505] [c0000011a7bbfcc0] [c0000000003d9588] evict+0x1e8/0x200 [ 2401.432528] [c0000011a7bbfd00] [c0000000003c64b0] do_unlinkat+0x2d0/0x3a0 [ 2401.432552] [c0000011a7bbfdc0] [c00000000002a458] system_call_exception+0xf8/0x1d0 [ 2401.432576] [c0000011a7bbfe20] [c00000000000d0a8] system_call_common+0xe8/0x218 [ 2401.432599] irq event stamp: 210662 [ 2401.432619] hardirqs last enabled at (210661): [<c0000000008d32bc>] _raw_spin_unlock_irq+0x3c/0x70 [ 2401.432652] hardirqs last disabled at (210662): [<c00000000000965c>] program_check_common_virt+0x2bc/0x310 [ 2401.432688] softirqs last enabled at (210280): [<c000000000547de4>] xfs_buf_find.isra.31+0xb54/0x1060 [ 2401.432722] softirqs last disabled at (210278): [<c0000000005476fc>] xfs_buf_find.isra.31+0x46c/0x1060
On Tue, Sep 22, 2020 at 10:00:01PM -0700, Darrick J. Wong wrote: > On Wed, Sep 23, 2020 at 03:48:59AM +0100, Matthew Wilcox wrote: > > On Tue, Sep 22, 2020 at 09:06:03PM -0400, Qian Cai wrote: > > > On Tue, 2020-09-22 at 18:05 +0100, Matthew Wilcox wrote: > > > > On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote: > > > > > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote: > > > > > > Size the uptodate array dynamically to support larger pages in the > > > > > > page cache. With a 64kB page, we're only saving 8 bytes per page today, > > > > > > but with a 2MB maximum page size, we'd have to allocate more than 4kB > > > > > > per page. Add a few debugging assertions. > > > > > > > > > > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > > > > > > Reviewed-by: Dave Chinner <dchinner@redhat.com> > > > > > > > > > > Some syscall fuzzing will trigger this on powerpc: > > > > > > > > > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config > > > > > > > > > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered- > > > > > io.c:78 iomap_page_release+0x250/0x270 > > > > > > > > Well, I'm glad it triggered. That warning is: > > > > WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != > > > > PageUptodate(page)); > > > > so there was definitely a problem of some kind. > > > > > > > > truncate_cleanup_page() calls > > > > do_invalidatepage() calls > > > > iomap_invalidatepage() calls > > > > iomap_page_release() > > > > > > > > Is this the first warning? I'm wondering if maybe there was an I/O error > > > > earlier which caused PageUptodate to get cleared again. If it's easy to > > > > reproduce, perhaps you could try something like this? > > > > > > > > +void dump_iomap_page(struct page *page, const char *reason) > > > > +{ > > > > + struct iomap_page *iop = to_iomap_page(page); > > > > + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); > > > > + > > > > + dump_page(page, reason); > > > > + if (iop) > > > > + printk("iop:reads %d writes %d uptodate %*pb\n", > > > > + atomic_read(&iop->read_bytes_pending), > > > > + atomic_read(&iop->write_bytes_pending), > > > > + nr_blocks, iop->uptodate); > > > > + else > > > > + printk("iop:none\n"); > > > > +} > > > > > > > > and then do something like: > > > > > > > > if (bitmap_full(iop->uptodate, nr_blocks) != PageUptodate(page)) > > > > dump_iomap_page(page, NULL); > > > > > > This: > > > > > > [ 1683.158254][T164965] page:000000004a6c16cd refcount:2 mapcount:0 mapping:00000000ea017dc5 index:0x2 pfn:0xc365c > > > [ 1683.158311][T164965] aops:xfs_address_space_operations ino:417b7e7 dentry name:"trinity-testfile2" > > > [ 1683.158354][T164965] flags: 0x7fff8000000015(locked|uptodate|lru) > > > [ 1683.158392][T164965] raw: 007fff8000000015 c00c0000019c4b08 c00c0000019a53c8 c000201c8362c1e8 > > > [ 1683.158430][T164965] raw: 0000000000000002 0000000000000000 00000002ffffffff c000201c54db4000 > > > [ 1683.158470][T164965] page->mem_cgroup:c000201c54db4000 > > > [ 1683.158506][T164965] iop:none > > > > Oh, I'm a fool. This is after the call to detach_page_private() so > > page->private is NULL and we don't get the iop dumped. > > > > Nevertheless, this is interesting. Somehow, the page is marked Uptodate, > > but the bitmap is deemed not full. There are three places where we set > > an iomap page Uptodate: > > > > 1. if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) > > SetPageUptodate(page); > > > > 2. if (page_has_private(page)) > > iomap_iop_set_range_uptodate(page, off, len); > > else > > SetPageUptodate(page); > > > > 3. BUG_ON(page->index); > > ... > > SetPageUptodate(page); > > > > It can't be #2 because the page has an iop. It can't be #3 because the > > page->index is not 0. So at some point in the past, the bitmap was full. > > > > I don't think it's possible for inode->i_blksize to change, and you > > aren't running with THPs, so it's definitely not possible for thp_size() > > to change. So i_blocks_per_page() isn't going to change. > > > > We seem to have allocated enough memory for ->iop because that's also > > based on i_blocks_per_page(). > > > > I'm out of ideas. Maybe I'll wake up with a better idea in the morning. > > I've been trying to reproduce this on x86 with a 1kB block size > > filesystem, and haven't been able to yet. Maybe I'll try to setup a > > powerpc cross-compilation environment tomorrow. > > FWIW I managed to reproduce it with the following fstests configuration > on a 1k block size fs on a x86 machinE: > > SECTION -- -no-sections- > FSTYP -- xfs > MKFS_OPTIONS -- -m reflink=1,rmapbt=1 -i sparse=1 -b size=1024 > MOUNT_OPTIONS -- -o usrquota,grpquota,prjquota > HOST_OPTIONS -- local.config > CHECK_OPTIONS -- -g auto > XFS_MKFS_OPTIONS -- -bsize=4096 > TIME_FACTOR -- 1 > LOAD_FACTOR -- 1 > TEST_DIR -- /mnt > TEST_DEV -- /dev/sde > SCRATCH_DEV -- /dev/sdd > SCRATCH_MNT -- /opt > OVL_UPPER -- ovl-upper > OVL_LOWER -- ovl-lower > OVL_WORK -- ovl-work > KERNEL -- 5.9.0-rc4-djw It just survived another 3-hour run for me: FSTYP -- xfs (debug) PLATFORM -- Linux/x86_64 bobo-kvm 5.9.0-rc4 #40 SMP Tue Sep 22 14:18:21 EDT 2020 MKFS_OPTIONS -- -f -m reflink=1,rmapbt=1 -i sparse=1 -b size=1024 /dev/sdc MOUNT_OPTIONS -- /dev/sdc /mnt/scratch The only warning I hit was in generic/019: 0172 WARNING: CPU: 1 PID: 6933 at fs/iomap/buffered-io.c:997 iomap_page_mkwrite_actor+0x72/0x80 which is the: WARN_ON_ONCE(!PageUptodate(page)); that happens as a result of the ClearPageUptodate() in iomap_writepage_map() which has been happening approximately forever.
On Tue, Sep 22, 2020 at 06:05:26PM +0100, Matthew Wilcox wrote: > On Tue, Sep 22, 2020 at 12:23:45PM -0400, Qian Cai wrote: > > On Fri, 2020-09-11 at 00:47 +0100, Matthew Wilcox (Oracle) wrote: > > > Size the uptodate array dynamically to support larger pages in the > > > page cache. With a 64kB page, we're only saving 8 bytes per page today, > > > but with a 2MB maximum page size, we'd have to allocate more than 4kB > > > per page. Add a few debugging assertions. > > > > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > > > Reviewed-by: Dave Chinner <dchinner@redhat.com> > > > > Some syscall fuzzing will trigger this on powerpc: > > > > .config: https://gitlab.com/cailca/linux-mm/-/blob/master/powerpc.config > > > > [ 8805.895344][T445431] WARNING: CPU: 61 PID: 445431 at fs/iomap/buffered-io.c:78 iomap_page_release+0x250/0x270 > > Well, I'm glad it triggered. That warning is: > WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != > PageUptodate(page)); > so there was definitely a problem of some kind. OK, I'm pretty sure the problem predated this commit, and it's simply that I added a warning (looking for something else) that caught this. I have a tree completly gunked up with debugging code now to try to understand the problem better, but my guess is that if you put this warning into a previous version, you'd see the same problem occurring (and it is a real problem, because we skip writeback of parts of the page which are !uptodate).
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7fc0e02d27b0..9670c096b83e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -22,18 +22,25 @@ #include "../internal.h" /* - * Structure allocated for each page when block size < PAGE_SIZE to track - * sub-page uptodate status and I/O completions. + * Structure allocated for each page or THP when block size < page size + * to track sub-page uptodate status and I/O completions. */ struct iomap_page { atomic_t read_count; atomic_t write_count; spinlock_t uptodate_lock; - DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); + unsigned long uptodate[]; }; static inline struct iomap_page *to_iomap_page(struct page *page) { + /* + * per-block data is stored in the head page. Callers should + * not be dealing with tail pages (and if they are, they can + * call thp_head() first. + */ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + if (page_has_private(page)) return (struct iomap_page *)page_private(page); return NULL; @@ -45,11 +52,13 @@ static struct iomap_page * iomap_page_create(struct inode *inode, struct page *page) { struct iomap_page *iop = to_iomap_page(page); + unsigned int nr_blocks = i_blocks_per_page(inode, page); - if (iop || i_blocks_per_page(inode, page) <= 1) + if (iop || nr_blocks <= 1) return iop; - iop = kzalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), + GFP_NOFS | __GFP_NOFAIL); spin_lock_init(&iop->uptodate_lock); attach_page_private(page, iop); return iop; @@ -59,11 +68,14 @@ static void iomap_page_release(struct page *page) { struct iomap_page *iop = detach_page_private(page); + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); if (!iop) return; WARN_ON_ONCE(atomic_read(&iop->read_count)); WARN_ON_ONCE(atomic_read(&iop->write_count)); + WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != + PageUptodate(page)); kfree(iop); }